From e080077a53fe2c5f14492e0ce6e084ebc101bcb2 Mon Sep 17 00:00:00 2001
From: monoid <qaswedfr55@gmail.com>
Date: Fri, 18 Feb 2022 18:31:53 +0900
Subject: [PATCH] feat: count tag frequency

---
 count_o.py        | 22 ----------------------
 count_tag_freq.py | 23 +++++++++++++++++++++++
 2 files changed, 23 insertions(+), 22 deletions(-)
 delete mode 100644 count_o.py
 create mode 100644 count_tag_freq.py

diff --git a/count_o.py b/count_o.py
deleted file mode 100644
index 5ce6c52..0000000
--- a/count_o.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from dataset import readPreporcssedDataAll
-from tqdm import tqdm
-
-
-"""
-count outside tokens(O tokens)
-"""
-if __name__ == "__main__":
-
-    train, _, _ = readPreporcssedDataAll()
-
-    total_l = 0
-    total_o = 0
-
-    for item in tqdm(train):
-        entities = item["entity"]
-        l = len(entities)
-        o = sum(map(lambda x: 1 if x == "O" else 0,entities))
-        total_l += l
-        total_o += o
-
-    print(f"{total_o}/{total_l} = {total_o/total_l}")
\ No newline at end of file
diff --git a/count_tag_freq.py b/count_tag_freq.py
new file mode 100644
index 0000000..b1fc391
--- /dev/null
+++ b/count_tag_freq.py
@@ -0,0 +1,23 @@
+from dataset import readPreporcssedDataAll
+from read_data import TagIdConverter
+from tqdm import tqdm
+from collections import Counter
+
+# Print how often each value of item["entity"] occurs in the `train` split.
+if __name__ == "__main__":
+    train, _, _ = readPreporcssedDataAll()
+    tagIdConverter = TagIdConverter()
+    counter = Counter()
+
+    # Counter.update tallies every entry of one item's "entity" sequence.
+    for item in tqdm(train):
+        counter.update(item["entity"])
+    # Denominator of the percentage column: every entry seen over all items.
+    total_l = sum(counter.values())
+
+    # Header ends in 'frequency%' so its '%' lines up with the '%' on each row.
+    print(f"{'token':<12}\t{'count':>12} {'frequency%':>12}")
+    for token, count in counter.most_common():
+        # convert_tokens_to_ids is fed a one-element list; keep its first id.
+        tid = tagIdConverter.convert_tokens_to_ids([token])[0]
+        print(f"{f'{token}({tid})':<12}\t{count:>12}{count*100/total_l:>12.3f}%")
\ No newline at end of file