Compare commits

..

No commits in common. "92205ea7952fe2bf849e1ca5143fddfc9e0da6ab" and "d2256b0ee949e7cb054f85386822633f3dd4a1e4" have entirely different histories.

4 changed files with 30 additions and 150 deletions

3
.gitignore vendored
View File

@ -1,5 +1,4 @@
.ipynb_checkpoints .ipynb_checkpoints
__pycache__ __pycache__
prepro/**/* prepro/**/*
tags.json tags.json
model.zip

View File

@ -724,7 +724,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 36, "execution_count": 36,
"id": "bd9b45b3", "id": "2a35055b",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -747,7 +747,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 39, "execution_count": 39,
"id": "3d62b34a", "id": "778c99b7",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -770,7 +770,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 38, "execution_count": 38,
"id": "c42f4980", "id": "798091aa",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -1021,7 +1021,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 27,
"id": "9664aa89", "id": "78e46670",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -1042,7 +1042,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "2404426f", "id": "ed61ce06",
"metadata": {}, "metadata": {},
"source": [ "source": [
"`groupby_index` 그룹으로 묶어서 실행" "`groupby_index` 그룹으로 묶어서 실행"
@ -1674,7 +1674,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "8c02816f", "id": "4830938c",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Outside 토큰에 해당하는 곳을 짜르겠습니다." "Outside 토큰에 해당하는 곳을 짜르겠습니다."
@ -1806,128 +1806,10 @@
" print(f\"class {i} f1 score : {f1}\")" " print(f\"class {i} f1 score : {f1}\")"
] ]
}, },
{
"cell_type": "markdown",
"id": "8da8a232",
"metadata": {},
"source": [
"nan 나온 것에 대해서 생각해보자."
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "9747e8de",
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"import collections"
]
},
{
"cell_type": "code",
"execution_count": 120,
"id": "676f8f16",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"5000it [00:00, 90912.13it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"token \t count frequency%\n",
"O(21) \t 174832 79.596%\n",
"I-PS(17) \t 12555 5.716%\n",
"I-OG(16) \t 10927 4.975%\n",
"B-PS(7) \t 4726 2.152%\n",
"I-DT(14) \t 4407 2.006%\n",
"B-OG(6) \t 3782 1.722%\n",
"I-LC(15) \t 2365 1.077%\n",
"B-DT(4) \t 2338 1.064%\n",
"B-LC(5) \t 2217 1.009%\n",
"I-TI(18) \t 1030 0.469%\n",
"B-TI(8) \t 397 0.181%\n",
"I-목소(19) \t 32 0.015%\n",
"I-(11) \t 15 0.007%\n",
"I-조선(20) \t 8 0.004%\n",
"I-1(12) \t 5 0.002%\n",
"B-(1) \t 4 0.002%\n",
"I-<휠(13) \t 4 0.002%\n",
"B-조선(10) \t 1 0.000%\n",
"B-목소(9) \t 1 0.000%\n",
"B-<휠(3) \t 1 0.000%\n",
"B-1(2) \t 1 0.000%\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"tagIdConverter = TagIdConverter()\n",
"counter = collections.Counter()\n",
"total_l = 0\n",
"\n",
"for item in tqdm(itertools.chain(datasetTrain,datasetDev,datasetTest)):\n",
" entities = item[\"entity\"]\n",
" for entity in entities:\n",
" counter[entity] += 1\n",
" total_l += len(entities)\n",
"print(f\"{'token':<12}\\t{'count':>12} {'frequency%':>12}\")\n",
"for token,count in counter.most_common():\n",
" tid = tagIdConverter.convert_tokens_to_ids([token])[0]\n",
" print(f\"{f'{token}({tid})':<12}\\t{count:>12}{count*100/total_l:>12.3f}%\")"
]
},
{
"cell_type": "markdown",
"id": "6c4191a3",
"metadata": {},
"source": [
"19, 11, 20, 12, 1, 13, 10, 9, 3, 2 번은 데이터 규모에 비해서 유의미한 데이터가 아니다. 샘플이 너무 적어서 학습하기에 부적절하다."
]
},
{
"cell_type": "markdown",
"id": "f86eeca1",
"metadata": {},
"source": [
"모델을 저장해보자"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "4ff86a28",
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.state_dict(), \"model.zip\")"
]
},
{
"cell_type": "markdown",
"id": "0db7cd17",
"metadata": {},
"source": [
"다음과 같이 하면 저장됨."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "1ea835d4", "id": "0b9b55e7",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []

22
count_o.py Normal file
View File

@ -0,0 +1,22 @@
from dataset import readPreporcssedDataAll
from tqdm import tqdm
"""
count outside tokens(O tokens)
"""
def count_outside_tokens(items):
    """Count outside ("O") tags and all tags across an iterable of samples.

    Args:
        items: Iterable of samples. Each sample is indexed with ``"entity"``
            and the result is iterated tag by tag.

    Returns:
        A ``(outside_count, total_count)`` tuple: the number of tags equal to
        ``"O"`` and the number of tags seen overall. Both are 0 for empty input.
    """
    outside_count = 0
    total_count = 0
    for item in items:
        for tag in item["entity"]:
            total_count += 1
            if tag == "O":
                outside_count += 1
    return outside_count, total_count


if __name__ == "__main__":
    # Only the first of the three returned values is used here.
    train, _, _ = readPreporcssedDataAll()
    total_o, total_l = count_outside_tokens(tqdm(train))
    if total_l:
        print(f"{total_o}/{total_l} = {total_o/total_l}")
    else:
        # With no tokens at all the ratio is undefined; report that instead
        # of crashing with ZeroDivisionError.
        print(f"{total_o}/{total_l} = undefined (no tokens)")

View File

@ -1,23 +0,0 @@
from dataset import readPreporcssedDataAll
from read_data import TagIdConverter
from tqdm import tqdm
from collections import Counter
"""
get frequency of tokens
"""
if __name__ == "__main__":
    # Only the first of the three returned values is used here.
    train, _, _ = readPreporcssedDataAll()
    tagIdConverter = TagIdConverter()

    # Tally every tag occurrence and keep a running total of all tags seen.
    counter = Counter()
    total_l = 0
    for sample in tqdm(train):
        tags = sample["entity"]
        # iter() keeps this an element-by-element count, exactly like an
        # explicit ``counter[tag] += 1`` loop.
        counter.update(iter(tags))
        total_l += len(tags)

    # Header row, then one row per tag ordered from most to least frequent.
    print(f"{'token':<12}\t{'count':>12} {'frequency%':>12}")
    for token, count in counter.most_common():
        # The converter takes a list of tokens; the first element of its
        # result is used as this token's id.
        tid = tagIdConverter.convert_tokens_to_ids([token])[0]
        print(f"{f'{token}({tid})':<12}\t{count:>12}{count*100/total_l:>12.3f}%")