ner-study/count_tag_freq.py
2022-02-18 18:31:53 +09:00

23 lines
729 B
Python

from dataset import readPreporcssedDataAll
from read_data import TagIdConverter
from tqdm import tqdm
from collections import Counter
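"""
Count how often each entity tag occurs in the training data and print
every tag with its id, its count, and its percentage of all tags.
"""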
def count_entity_tags(items) -> Counter:
    """Count how many times each entity tag occurs across ``items``.

    Args:
        items: Iterable of mappings. Each mapping's ``"entity"`` value is
            iterated, and every element counts as one tag occurrence.

    Returns:
        A ``Counter`` mapping each tag to its number of occurrences.
    """
    counts = Counter()
    for item in items:
        for entity in item["entity"]:
            counts[entity] += 1
    return counts


def format_row(token, tid, count, total) -> str:
    """Format one table row: ``token(tid)``, its count, and its share of ``total`` in percent."""
    return f"{f'{token}({tid})':<12}\t{count:>12}{count*100/total:>12.3f}%"


def main() -> None:
    """Print a tag frequency table for the first split of the preprocessed data."""
    train, _, _ = readPreporcssedDataAll()
    tag_id_converter = TagIdConverter()
    # tqdm only wraps the iterable to show progress; counting is unaffected.
    counts = count_entity_tags(tqdm(train))
    # Each counted element adds exactly one, so the grand total is the sum
    # of all per-tag counts. It is non-zero whenever a row gets printed.
    total = sum(counts.values())
    print(f"{'token':<12}\t{'count':>12} {'frequency%':>12}")
    for token, count in counts.most_common():
        tid = tag_id_converter.convert_tokens_to_ids([token])[0]
        print(format_row(token, tid, count, total))


if __name__ == "__main__":
    main()