feat: get args

This commit is contained in:
monoid 2022-02-22 17:20:16 +09:00
parent 609174b089
commit 142ad917bc
1 changed files with 27 additions and 8 deletions

View File

@ -1,7 +1,9 @@
import enum
from io import TextIOWrapper
import sys
from typing import Iterable, NamedTuple, List, Sequence, TypeVar
import json
import argparse
KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
EnglishBase="[Ko, En] NER, POStag data/영문 NER, POS"
@ -170,20 +172,33 @@ def make_long_namedEntity(a,b,c):
extracts and stores tags set from the given data.
"""
if __name__ == "__main__":
# Command-line interface: which language to process and where the tag list goes.
parser = argparse.ArgumentParser(description="create tags list")
cli_options = (
    (("--kind", "-k"), {"default": 'korean', "help": 'kind of language: korean or english'}),
    (("--stdout",), {"action": 'store_true', "help": 'print tags data to stdout'}),
    (("--path",), {"default": "tags.json", "help": "path of tags data"}),
)
# Register each option from the table above; order matches the --help listing.
for option_flags, option_settings in cli_options:
    parser.add_argument(*option_flags, **option_settings)
args = parser.parse_args()
from tqdm import tqdm
# Pick the dataset loader that matches the requested language.
# The two branches previously called each other's loader (korean -> English data).
if args.kind in ("korean", "ko", "kor"):
    train, dev, test = readKoreanDataAll()
elif args.kind in ("english", "en", "eng"):
    train, dev, test = readEnglishDataAll()
else:
    print("unknown language",file=sys.stderr)
    # sys.exit is always available; the bare exit() helper is only injected by `site`.
    sys.exit(1)
# Distinct entries gathered from every sentence's `detail`, shared by getTags below.
vocab = set()

def getTags(lst: List[Sentence]):
    """Add every element of each sentence's ``detail`` to the enclosing ``vocab`` set.

    Iteration over ``lst`` is wrapped in ``tqdm`` so a progress bar is shown.
    """
    for sentence in tqdm(lst):
        vocab.update(sentence.detail)
# Progress and debug messages go to stderr only, so that stdout carries nothing
# but the JSON document when --stdout is requested.
for split_name, split_data in (("train", train), ("dev", dev), ("test", test)):
    print(f"get tags from {split_name}...",file=sys.stderr)
    getTags(split_data)
print(vocab,file=sys.stderr)
for v in vocab:
if v == "O":
continue
@ -200,6 +215,10 @@ if __name__ == "__main__":
for v in vocab_list:
tags.append({"name":v,"index":i})
i += 1
# Debug dump goes to stderr; stdout is reserved for the JSON payload itself.
print(tags,file=sys.stderr)
if args.stdout:
    # Emit the tag list on stdout so it can be piped to another tool.
    json.dump(tags,sys.stdout,ensure_ascii=False, indent=2)
else:
    # Write to the user-supplied path (defaults to "tags.json") instead of a
    # hard-coded file name.
    with open(args.path,"w",encoding="utf-8") as fp:
        json.dump(tags,fp,ensure_ascii=False, indent=2)