"""Print how often each entity tag occurs in the ``train`` split.

The split is the first value returned by ``readPreporcssedDataAll()``.
One row is printed per distinct tag, most frequent first, showing the tag,
its id as reported by ``TagIdConverter``, its count and its share of all tags.
"""
from collections import Counter


def count_tags(items):
    """Tally the entity tags of every item.

    Args:
        items: Iterable of mappings; each must provide an ``"entity"`` key
            holding an iterable of hashable tags.

    Returns:
        A ``Counter`` mapping each tag to its number of occurrences.
    """
    counter = Counter()
    for item in items:
        counter.update(item["entity"])
    return counter


def iter_report_lines(counter, tag_to_id):
    """Yield the header line followed by one formatted line per tag.

    Args:
        counter: ``Counter`` of tag occurrences (see ``count_tags``).
        tag_to_id: Callable returning the id to display next to a tag.

    Yields:
        The lines of the frequency table, ordered by ``most_common()``.
    """
    total = sum(counter.values())
    yield f"{'token':<12}\t{'count':>12} {'frequency%':>12}"
    for token, count in counter.most_common():
        label = f"{token}({tag_to_id(token)})"
        # A counter holding only zero counts has total == 0; report 0% instead
        # of dividing by zero.
        percent = count * 100 / total if total else 0.0
        yield f"{label:<12}\t{count:>12}{percent:>12.3f}%"


def main():
    """Load the data, count the entity tags and print the frequency table."""
    # Project-local and third-party imports are deferred so the pure helpers
    # above stay importable without the dataset modules or tqdm.
    from dataset import readPreporcssedDataAll
    from read_data import TagIdConverter
    from tqdm import tqdm

    train, _, _ = readPreporcssedDataAll()
    converter = TagIdConverter()
    counter = count_tags(tqdm(train))

    def tag_to_id(tag):
        # The converter works on lists, so wrap the tag and unwrap the result.
        return converter.convert_tokens_to_ids([tag])[0]

    for line in iter_report_lines(counter, tag_to_id):
        print(line)


if __name__ == "__main__":
    main()