feat: count tag frequency

This commit is contained in:
monoid 2022-02-18 18:31:53 +09:00
parent d2256b0ee9
commit e080077a53
2 changed files with 23 additions and 22 deletions

View File

@ -1,22 +0,0 @@
from dataset import readPreporcssedDataAll
from tqdm import tqdm
"""
count outside tokens(O tokens)
"""
if __name__ == "__main__":
    # Report how many tags in the training split are the outside tag "O",
    # as a raw fraction and as a ratio.
    # Only the first of the three values returned by the loader is used;
    # the other two are discarded.
    train, _, _ = readPreporcssedDataAll()

    total_l = 0  # number of tags seen across all items
    total_o = 0  # number of those tags that are exactly "O"
    for item in tqdm(train):
        entities = item["entity"]
        total_l += len(entities)
        total_o += sum(1 for tag in entities if tag == "O")

    # With no tags at all the division would raise ZeroDivisionError;
    # report NaN instead so the summary line keeps its usual shape.
    ratio = total_o / total_l if total_l else float("nan")
    print(f"{total_o}/{total_l} = {ratio}")

23
count_tag_freq.py Normal file
View File

@ -0,0 +1,23 @@
from dataset import readPreporcssedDataAll
from read_data import TagIdConverter
from tqdm import tqdm
from collections import Counter
"""
get frequency of tokens
"""
if __name__ == "__main__":
    # Print a table of every tag found in the training split with its id,
    # its occurrence count and its share of all tags, most frequent first.
    # Only the first of the loader's three return values is needed here.
    train_split, _, _ = readPreporcssedDataAll()
    converter = TagIdConverter()

    # Tally every tag of every item in one pass; tqdm still reports
    # progress per item because the generator pulls items through it.
    tag_counts = Counter(
        tag for item in tqdm(train_split) for tag in item["entity"]
    )
    # Every tag contributes exactly one count, so the sum of the counts
    # is the total number of tags seen.
    total_l = sum(tag_counts.values())

    print(f"{'token':<12}\t{'count':>12} {'frequency%':>12}")
    for token, count in tag_counts.most_common():
        # The converter takes a list of tokens and is indexed for the
        # single result, so wrap the one token and unwrap the one id.
        (tid,) = converter.convert_tokens_to_ids([token])[:1]
        print(f"{f'{token}({tid})':<12}\t{count:>12}{count*100/total_l:>12.3f}%")