diff --git a/.gitignore b/.gitignore index e4945e0..366febb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .ipynb_checkpoints __pycache__ prepro/**/* -tags.json \ No newline at end of file +tags.json +model.zip \ No newline at end of file diff --git a/Training.ipynb b/Training.ipynb index 721a316..dfe354d 100644 --- a/Training.ipynb +++ b/Training.ipynb @@ -724,7 +724,7 @@ { "cell_type": "code", "execution_count": 36, - "id": "2a35055b", + "id": "bd9b45b3", "metadata": {}, "outputs": [ { @@ -747,7 +747,7 @@ { "cell_type": "code", "execution_count": 39, - "id": "778c99b7", + "id": "3d62b34a", "metadata": {}, "outputs": [ { @@ -770,7 +770,7 @@ { "cell_type": "code", "execution_count": 38, - "id": "798091aa", + "id": "c42f4980", "metadata": {}, "outputs": [ { @@ -1021,7 +1021,7 @@ { "cell_type": "code", "execution_count": 27, - "id": "78e46670", + "id": "9664aa89", "metadata": {}, "outputs": [ { @@ -1042,7 +1042,7 @@ }, { "cell_type": "markdown", - "id": "ed61ce06", + "id": "2404426f", "metadata": {}, "source": [ "`groupby_index` 그룹으로 묶어서 실행" @@ -1674,7 +1674,7 @@ }, { "cell_type": "markdown", - "id": "4830938c", + "id": "8c02816f", "metadata": {}, "source": [ "Outside 토큰에 해당하는 곳을 짜르겠습니다." @@ -1806,10 +1806,128 @@ " print(f\"class {i} f1 score : {f1}\")" ] }, + { + "cell_type": "markdown", + "id": "8da8a232", + "metadata": {}, + "source": [ + "nan 나온 것에 대해서 생각해보자." + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "9747e8de", + "metadata": {}, + "outputs": [], + "source": [ + "import itertools\n", + "import collections" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "676f8f16", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "5000it [00:00, 90912.13it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "token \t count frequency%\n", + "O(21) \t 174832 79.596%\n", + "I-PS(17) \t 12555 5.716%\n", + "I-OG(16) \t 10927 4.975%\n", + "B-PS(7) \t 4726 2.152%\n", + "I-DT(14) \t 4407 2.006%\n", + "B-OG(6) \t 3782 1.722%\n", + "I-LC(15) \t 2365 1.077%\n", + "B-DT(4) \t 2338 1.064%\n", + "B-LC(5) \t 2217 1.009%\n", + "I-TI(18) \t 1030 0.469%\n", + "B-TI(8) \t 397 0.181%\n", + "I-목소(19) \t 32 0.015%\n", + "I-(11) \t 15 0.007%\n", + "I-조선(20) \t 8 0.004%\n", + "I-1(12) \t 5 0.002%\n", + "B-(1) \t 4 0.002%\n", + "I-<휠(13) \t 4 0.002%\n", + "B-조선(10) \t 1 0.000%\n", + "B-목소(9) \t 1 0.000%\n", + "B-<휠(3) \t 1 0.000%\n", + "B-1(2) \t 1 0.000%\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "tagIdConverter = TagIdConverter()\n", + "counter = collections.Counter()\n", + "total_l = 0\n", + "\n", + "for item in tqdm(itertools.chain(datasetTrain,datasetDev,datasetTest)):\n", + " entities = item[\"entity\"]\n", + " for entity in entities:\n", + " counter[entity] += 1\n", + " total_l += len(entities)\n", + "print(f\"{'token':<12}\\t{'count':>12} {'frequency%':>12}\")\n", + "for token,count in counter.most_common():\n", + " tid = tagIdConverter.convert_tokens_to_ids([token])[0]\n", + " print(f\"{f'{token}({tid})':<12}\\t{count:>12}{count*100/total_l:>12.3f}%\")" + ] + }, + { + "cell_type": "markdown", + "id": "6c4191a3", + "metadata": {}, + "source": [ + "19, 11, 20, 12, 1, 13, 10, 9, 3, 2 번은 데이터 규모에 비해서 유의미한 데이터가 아니다. 샘플이 너무 적어서 학습하기에 부적절하다." + ] + }, + { + "cell_type": "markdown", + "id": "f86eeca1", + "metadata": {}, + "source": [ + "모델을 저장해보자" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "4ff86a28", + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model.state_dict(), \"model.zip\")" + ] + }, + { + "cell_type": "markdown", + "id": "0db7cd17", + "metadata": {}, + "source": [ + "다음과 같이 하면 저장됨." + ] + }, { "cell_type": "code", "execution_count": null, - "id": "0b9b55e7", + "id": "1ea835d4", "metadata": {}, "outputs": [], "source": []