diff --git a/read_data.py b/read_data.py
index 93e276d..8b7851e 100644
--- a/read_data.py
+++ b/read_data.py
@@ -1,7 +1,9 @@
 import enum
 from io import TextIOWrapper
+import sys
 from typing import Iterable, NamedTuple, List, Sequence, TypeVar
 import json
+import argparse
 
 KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
 EnglishBase="[Ko, En] NER, POStag data/영문 NER, POS"
@@ -170,20 +172,33 @@
 extracts and stores tags set from the given data.
 """
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="create tags list")
+    parser.add_argument("--kind","-k",default='korean', help='kind of language: korean or english')
+    parser.add_argument("--stdout",action='store_true',help='print tags data to stdout')
+    parser.add_argument("--path",default="tags.json", help="path of tags data")
+
+    args = parser.parse_args()
+
     from tqdm import tqdm
-    train, dev, test = readEnglishDataAll()
+    if args.kind == "korean" or args.kind == "ko" or args.kind == "kor":
+        train, dev, test = readKoreanDataAll()
+    elif args.kind == "english" or args.kind == "en" or args.kind =="eng":
+        train, dev, test = readEnglishDataAll()
+    else:
+        print("unknown language",file=sys.stderr)
+        sys.exit(1)
     vocab = set()
     def getTags(lst: List[Sentence]):
         for s in tqdm(lst):
             for e in s.detail:
                 vocab.add(e)
-    print("get tags from train...")
+    print("get tags from train...",file=sys.stderr)
     getTags(train)
-    print("get tags from dev...")
+    print("get tags from dev...",file=sys.stderr)
     getTags(dev)
-    print("get tags from test...")
+    print("get tags from test...",file=sys.stderr)
     getTags(test)
-    print(vocab)
+    print(vocab,file=sys.stderr)
     for v in vocab:
         if v == "O":
             continue
@@ -200,6 +215,10 @@ if __name__ == "__main__":
     for v in vocab_list:
         tags.append({"name":v,"index":i})
         i += 1
-    print(tags)
-    with open("eng_tags.json","w",encoding="utf-8") as fp:
-        json.dump(tags,fp,ensure_ascii=False, indent=2)
\ No newline at end of file
+    print(tags,file=sys.stderr)
+    if args.stdout:
+        json.dump(tags,sys.stdout,ensure_ascii=False, indent=2)
+    else:
+        p = args.path
+        with open(p,"w",encoding="utf-8") as fp:
+            json.dump(tags,fp,ensure_ascii=False, indent=2)
\ No newline at end of file