feat: add count freq and save model

feat: count tag frequency
2022-02-18 18:33:51 +09:00 · 2022-02-18 18:31:53 +09:00
4 changed files with 150 additions and 30 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,4 @@
 __pycache__
 prepro/**/*
 tags.json
+model.zip
--- a/Training.ipynb
+++ b/Training.ipynb
@ -724,7 +724,7 @@
  {
   "cell_type": "code",
   "execution_count": 36,
-   "id": "2a35055b",
+   "id": "bd9b45b3",
   "metadata": {},
   "outputs": [
    {
@ -747,7 +747,7 @@
  {
   "cell_type": "code",
   "execution_count": 39,
-   "id": "778c99b7",
+   "id": "3d62b34a",
   "metadata": {},
   "outputs": [
    {
@ -770,7 +770,7 @@
  {
   "cell_type": "code",
   "execution_count": 38,
-   "id": "798091aa",
+   "id": "c42f4980",
   "metadata": {},
   "outputs": [
    {
@ -1021,7 +1021,7 @@
  {
   "cell_type": "code",
   "execution_count": 27,
-   "id": "78e46670",
+   "id": "9664aa89",
   "metadata": {},
   "outputs": [
    {
@ -1042,7 +1042,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "ed61ce06",
+   "id": "2404426f",
   "metadata": {},
   "source": [
    "`groupby_index` 그룹으로 묶어서 실행"
@ -1674,7 +1674,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "4830938c",
+   "id": "8c02816f",
   "metadata": {},
   "source": [
    "Outside 토큰에 해당하는 곳을 짜르겠습니다."
@ -1806,10 +1806,128 @@
    "    print(f\"class {i} f1 score : {f1}\")"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "8da8a232",
+   "metadata": {},
+   "source": [
+    "nan 나온 것에 대해서 생각해보자."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "id": "9747e8de",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import itertools\n",
+    "import collections"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 120,
+   "id": "676f8f16",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "5000it [00:00, 90912.13it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "token       \t       count   frequency%\n",
+      "O(21)       \t      174832      79.596%\n",
+      "I-PS(17)    \t       12555       5.716%\n",
+      "I-OG(16)    \t       10927       4.975%\n",
+      "B-PS(7)     \t        4726       2.152%\n",
+      "I-DT(14)    \t        4407       2.006%\n",
+      "B-OG(6)     \t        3782       1.722%\n",
+      "I-LC(15)    \t        2365       1.077%\n",
+      "B-DT(4)     \t        2338       1.064%\n",
+      "B-LC(5)     \t        2217       1.009%\n",
+      "I-TI(18)    \t        1030       0.469%\n",
+      "B-TI(8)     \t         397       0.181%\n",
+      "I-목소(19)    \t          32       0.015%\n",
+      "I-(11)      \t          15       0.007%\n",
+      "I-조선(20)    \t           8       0.004%\n",
+      "I-1(12)     \t           5       0.002%\n",
+      "B-(1)       \t           4       0.002%\n",
+      "I-<휠(13)    \t           4       0.002%\n",
+      "B-조선(10)    \t           1       0.000%\n",
+      "B-목소(9)     \t           1       0.000%\n",
+      "B-<휠(3)     \t           1       0.000%\n",
+      "B-1(2)      \t           1       0.000%\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "tagIdConverter = TagIdConverter()\n",
+    "\n",
+    "# Tally every entity tag across the three dataset objects in one pass;\n",
+    "# tqdm wraps the chained iterator so progress is still reported.\n",
+    "all_items = tqdm(itertools.chain(datasetTrain, datasetDev, datasetTest))\n",
+    "counter = collections.Counter(tag for item in all_items for tag in item[\"entity\"])\n",
+    "# Each tag occurrence contributes exactly 1, so the grand total is the sum of all counts.\n",
+    "total_l = sum(counter.values())\n",
+    "\n",
+    "# The single space before the last header column offsets the '%' appended to each row.\n",
+    "print(f\"{'token':<12}\\t{'count':>12} {'frequency%':>12}\")\n",
+    "for token, count in counter.most_common():\n",
+    "    tid = tagIdConverter.convert_tokens_to_ids([token])[0]\n",
+    "    label = f\"{token}({tid})\"\n",
+    "    print(f\"{label:<12}\\t{count:>12}{count*100/total_l:>12.3f}%\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6c4191a3",
+   "metadata": {},
+   "source": [
+    "19, 11, 20, 12, 1, 13, 10, 9, 3, 2 번은 데이터 규모에 비해서 유의미한 데이터가 아니다. 샘플이 너무 적어서 학습하기에 부적절하다."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f86eeca1",
+   "metadata": {},
+   "source": [
+    "모델을 저장해보자"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "id": "4ff86a28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the dictionary returned by model.state_dict() (not the model object itself) to model.zip.\n",
+    "# NOTE(review): reloading presumably requires rebuilding the same model and calling load_state_dict - confirm.\n",
+    "torch.save(model.state_dict(), \"model.zip\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0db7cd17",
+   "metadata": {},
+   "source": [
+    "위와 같이 하면 저장됨."
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "0b9b55e7",
+   "id": "1ea835d4",
   "metadata": {},
   "outputs": [],
   "source": []
--- a/count_o.py
+++ b/count_o.py
@ -1,22 +0,0 @@
-from dataset import readPreporcssedDataAll
-from tqdm import tqdm
-
-
-"""
-count outside tokens(O tokens)
-"""
-if __name__ == "__main__":
-
-    train, _, _ = readPreporcssedDataAll()
-
-    total_l = 0
-    total_o = 0
-
-    for item in tqdm(train):
-        entities = item["entity"]
-        l = len(entities)
-        o = sum(map(lambda x: 1 if x == "O" else 0,entities))
-        total_l += l
-        total_o += o
-
-    print(f"{total_o}/{total_l} = {total_o/total_l}")
--- a/count_tag_freq.py
+++ b/count_tag_freq.py
@ -0,0 +1,23 @@
+from dataset import readPreporcssedDataAll
+from read_data import TagIdConverter
+from tqdm import tqdm
+from collections import Counter
+
+"""
+get frequency of tokens
+"""
+if __name__ == "__main__":
+    # Only the first of the three values returned by the loader is tallied here.
+    train_split, _, _ = readPreporcssedDataAll()
+    converter = TagIdConverter()
+
+    # Build the tally in one pass; tqdm wraps the samples so progress is still reported.
+    tag_counts = Counter(
+        tag for sample in tqdm(train_split) for tag in sample["entity"]
+    )
+    # Each tag occurrence contributes exactly 1, so the grand total is the sum of all counts.
+    n_tags = sum(tag_counts.values())
+
+    # The single space before the last header column offsets the '%' appended to each row.
+    print(f"{'token':<12}\t{'count':>12} {'frequency%':>12}")
+    for tag, n in tag_counts.most_common():
+        tag_id = converter.convert_tokens_to_ids([tag])[0]
+        label = f"{tag}({tag_id})"
+        print(f"{label:<12}\t{n:>12}{n*100/n_tags:>12.3f}%")
Author	SHA1	Message	Date
monoid	92205ea795	feat: add count freq and save model	2022-02-18 18:33:51 +09:00
monoid	e080077a53	feat: count tag frequency	2022-02-18 18:31:53 +09:00