"""Print a frequency table of the values stored under each item's "entity" key.

The first dataset returned by ``readPreporcssedDataAll`` (bound to ``train``)
is scanned; for every distinct value the script prints the value, the id
``TagIdConverter.convert_tokens_to_ids`` reports for it, its count, and its
percentage of all counted values.
"""

from collections import Counter

from tqdm import tqdm

from dataset import readPreporcssedDataAll
from read_data import TagIdConverter


def count_entities(items):
    """Tally every value found under each item's ``"entity"`` key.

    Args:
        items: Iterable of subscriptable items; ``item["entity"]`` must be a
            sized iterable of hashable values.

    Returns:
        A ``(counter, total)`` tuple: ``counter`` maps each value to its
        number of occurrences and ``total`` is the summed length of all
        ``"entity"`` collections.
    """
    counter = Counter()
    total = 0
    for item in items:
        entities = item["entity"]
        # iter() keeps element-by-element counting even if `entities` were a
        # mapping (Counter.update adds a mapping's values instead).
        counter.update(iter(entities))
        total += len(entities)
    return counter, total


def main():
    """Load the data, count the entity values and print the table."""
    train, _, _ = readPreporcssedDataAll()
    tag_id_converter = TagIdConverter()

    # tqdm only wraps the iteration to show a progress bar.
    counter, total = count_entities(tqdm(train))

    print(f"{'token':<12}\t{'count':>12} {'frequency%':>12}")
    for token, count in counter.most_common():
        tid = tag_id_converter.convert_tokens_to_ids([token])[0]
        label = f"{token}({tid})"
        # No space between the two numeric columns: the trailing "%" then
        # lines up with the last character of the "frequency%" header.
        print(f"{label:<12}\t{count:>12}{count * 100 / total:>12.3f}%")


if __name__ == "__main__":
    main()