23 lines
729 B
Python
23 lines
729 B
Python
|
from dataset import readPreporcssedDataAll
|
||
|
from read_data import TagIdConverter
|
||
|
from tqdm import tqdm
|
||
|
from collections import Counter
|
||
|
|
||
|
"""
|
||
|
Count how often each entity token occurs in the training data and print the frequencies.
|
||
|
"""
|
||
|
if __name__ == "__main__":
    # Script entry point: tally every value found under item["entity"] in the
    # training data, then print one row per distinct token with its id,
    # absolute count and share of the total.

    # Only the first element of the returned triple is used here.
    train, _, _ = readPreporcssedDataAll()
    tagIdConverter = TagIdConverter()

    tag_counts = Counter()
    total_l = 0  # running number of entity tokens seen across all items
    for sample in tqdm(train):
        # NOTE(review): assumes sample["entity"] is a sequence of hashable
        # tokens (not a mapping) — confirm against the dataset reader.
        tags = sample["entity"]
        tag_counts.update(tags)
        total_l = total_l + len(tags)

    # Table header; the rows below put a trailing '%' where the header has a
    # separating space, so the right edges of the columns line up.
    print(f"{'token':<12}\t{'count':>12} {'frequency%':>12}")

    # most_common() with no argument yields every token, highest count first.
    for token, count in tag_counts.most_common():
        # The converter takes a list of tokens; we pass one and unwrap it.
        tid = tagIdConverter.convert_tokens_to_ids([token])[0]
        print(f"{f'{token}({tid})':<12}\t{count:>12}{count*100/total_l:>12.3f}%")
|