feat: add count freq and save model

feat: count tag frequency
2022-02-18 18:33:51 +09:00 · 2022-02-18 18:31:53 +09:00
4 changed files with 150 additions and 30 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,4 @@
 __pycache__
 prepro/**/*
 tags.json
 model.zip
--- a/Training.ipynb
+++ b/Training.ipynb
@ -724,7 +724,7 @@
  {
   "cell_type": "code",
   "execution_count": 36,
-   "id": "2a35055b",
+   "id": "bd9b45b3",
   "metadata": {},
   "outputs": [
    {
@ -747,7 +747,7 @@
  {
   "cell_type": "code",
   "execution_count": 39,
-   "id": "778c99b7",
+   "id": "3d62b34a",
   "metadata": {},
   "outputs": [
    {
@ -770,7 +770,7 @@
  {
   "cell_type": "code",
   "execution_count": 38,
-   "id": "798091aa",
+   "id": "c42f4980",
   "metadata": {},
   "outputs": [
    {
@ -1021,7 +1021,7 @@
  {
   "cell_type": "code",
   "execution_count": 27,
-   "id": "78e46670",
+   "id": "9664aa89",
   "metadata": {},
   "outputs": [
    {
@ -1042,7 +1042,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "ed61ce06",
+   "id": "2404426f",
   "metadata": {},
   "source": [
    "`groupby_index` 그룹으로 묶어서 실행"
@ -1674,7 +1674,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "4830938c",
+   "id": "8c02816f",
   "metadata": {},
   "source": [
    "Outside 토큰에 해당하는 곳을 자르겠습니다."
@ -1806,10 +1806,128 @@
    "    print(f\"class {i} f1 score : {f1}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8da8a232",
   "metadata": {},
   "source": [
    "nan이 나온 것에 대해서 생각해 보자."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "9747e8de",
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "import collections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "id": "676f8f16",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "5000it [00:00, 90912.13it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "token       \t       count   frequency%\n",
      "O(21)       \t      174832      79.596%\n",
      "I-PS(17)    \t       12555       5.716%\n",
      "I-OG(16)    \t       10927       4.975%\n",
      "B-PS(7)     \t        4726       2.152%\n",
      "I-DT(14)    \t        4407       2.006%\n",
      "B-OG(6)     \t        3782       1.722%\n",
      "I-LC(15)    \t        2365       1.077%\n",
      "B-DT(4)     \t        2338       1.064%\n",
      "B-LC(5)     \t        2217       1.009%\n",
      "I-TI(18)    \t        1030       0.469%\n",
      "B-TI(8)     \t         397       0.181%\n",
      "I-목소(19)    \t          32       0.015%\n",
      "I-(11)      \t          15       0.007%\n",
      "I-조선(20)    \t           8       0.004%\n",
      "I-1(12)     \t           5       0.002%\n",
      "B-(1)       \t           4       0.002%\n",
      "I-<휠(13)    \t           4       0.002%\n",
      "B-조선(10)    \t           1       0.000%\n",
      "B-목소(9)     \t           1       0.000%\n",
      "B-<휠(3)     \t           1       0.000%\n",
      "B-1(2)      \t           1       0.000%\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "tagIdConverter = TagIdConverter()\n",
    "counter = collections.Counter()\n",
    "total_l = 0\n",
    "\n",
    "for item in tqdm(itertools.chain(datasetTrain,datasetDev,datasetTest)):\n",
    "    entities = item[\"entity\"]\n",
    "    for entity in entities:\n",
    "        counter[entity] += 1\n",
    "    total_l += len(entities)\n",
    "print(f\"{'token':<12}\\t{'count':>12} {'frequency%':>12}\")\n",
    "for token,count in counter.most_common():\n",
    "    tid = tagIdConverter.convert_tokens_to_ids([token])[0]\n",
    "    print(f\"{f'{token}({tid})':<12}\\t{count:>12}{count*100/total_l:>12.3f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c4191a3",
   "metadata": {},
   "source": [
    "태그 ID 19, 11, 20, 12, 1, 13, 10, 9, 3, 2번은 전체 데이터 규모에 비해 유의미한 데이터가 아니다. 샘플이 너무 적어서 학습하기에 부적절하다."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f86eeca1",
   "metadata": {},
   "source": [
    "모델을 저장해보자"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "id": "4ff86a28",
   "metadata": {},
   "outputs": [],
   "source": [
    "torch.save(model.state_dict(), \"model.zip\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0db7cd17",
   "metadata": {},
   "source": [
    "다음과 같이 하면 저장됨."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "0b9b55e7",
+   "id": "1ea835d4",
   "metadata": {},
   "outputs": [],
   "source": []
--- a/count_o.py
+++ b/count_o.py
@ -1,22 +0,0 @@
 from dataset import readPreporcssedDataAll
 from tqdm import tqdm
 """
 count outside tokens(O tokens)
 """
 if __name__ == "__main__":
    # Report how many of the entity tags in the first split returned by
    # readPreporcssedDataAll() are the outside tag "O", printed as
    # "<O count>/<total count> = <ratio>".
    train, _, _ = readPreporcssedDataAll()
    total_l = 0  # number of entity tags seen so far
    total_o = 0  # number of those tags equal to "O"
    for item in tqdm(train):
        entities = item["entity"]
        total_l += len(entities)
        total_o += sum(1 for tag in entities if tag == "O")
    # With no tags at all the ratio is undefined; report NaN rather than
    # aborting with ZeroDivisionError after the whole pass has finished.
    ratio = total_o / total_l if total_l else float("nan")
    print(f"{total_o}/{total_l} = {ratio}")
--- a/count_tag_freq.py
+++ b/count_tag_freq.py
@ -0,0 +1,23 @@
 from dataset import readPreporcssedDataAll
 from read_data import TagIdConverter
 from tqdm import tqdm
 from collections import Counter
 """
 get frequency of tokens
 """
 if __name__ == "__main__":
    # Tally every entity tag in the first split returned by
    # readPreporcssedDataAll() and print one table row per distinct tag,
    # ordered from most to least frequent.
    train_split, _, _ = readPreporcssedDataAll()
    converter = TagIdConverter()
    tag_counts = Counter()
    total_l = 0  # total number of tags across all items (percentage denominator)
    for sample in tqdm(train_split):
        tags = sample["entity"]
        total_l += len(tags)
        for tag in tags:
            tag_counts[tag] += 1
    # Header row, then "tag(id)", absolute count and share in percent per tag.
    print(f"{'token':<12}\t{'count':>12} {'frequency%':>12}")
    for token, count in tag_counts.most_common():
        # The converter takes a list of tags; look up the id of this single tag.
        tid = converter.convert_tokens_to_ids([token])[0]
        print(f"{f'{token}({tid})':<12}\t{count:>12}{count*100/total_l:>12.3f}%")
Author	SHA1	Message	Date
monoid	92205ea795	feat: add count freq and save model	2022-02-18 18:33:51 +09:00
monoid	e080077a53	feat: count tag frequency	2022-02-18 18:31:53 +09:00