import argparse
import json
import os
import sys
from typing import Any, List

import tqdm
from transformers import BertTokenizer, PreTrainedTokenizer

from read_data import (
    Sentence,
    TagIdConverter,
    make_long_namedEntity,
    readEnglishDataAll,
    readKoreanDataAll,
)

PRE_BASE_PATH = 'prepro'


def preprocessing(tokenizer: PreTrainedTokenizer, converter: TagIdConverter,
                  dataset: List[Sentence]):
    """Tokenize each sentence and align its word-level NER tags to subword tokens."""
    ret = []
    for item in tqdm.tqdm(dataset):
        assert len(item.word) == len(item.detail)
        tokens = tokenizer.tokenize(" ".join(item.word))
        # Expand the word-level tags so that every subword token receives a tag.
        e = make_long_namedEntity(item.word, tokens, item.detail)
        if len(e) != len(tokens):
            print(e, tokens)
        assert len(e) == len(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        entityIds = converter.convert_tokens_to_ids(e)
        ret.append({"tokens": tokens, "ids": ids, "entity": e, "entity_ids": entityIds})
    return ret


def saveObject(path: str, data: Any):
    with open(path, "w", encoding="utf-8") as fp:
        json.dump(data, fp, ensure_ascii=False, indent=2)


def readPreprocessedData(path: str):
    with open(path, "r", encoding="utf-8") as fp:
        return json.load(fp)


def readPreporcssedDataAll(path=PRE_BASE_PATH):
    train = readPreprocessedData(os.path.join(path, "train.json"))
    dev = readPreprocessedData(os.path.join(path, "dev.json"))
    test = readPreprocessedData(os.path.join(path, "test.json"))
    return train, dev, test


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--kind", default="korean")
    # nargs="?" makes the positional optional, so the default is actually applied.
    parser.add_argument("path", nargs="?", default=PRE_BASE_PATH,
                        help="directory path of processed data")
    parser.add_argument("--tag", default="tags.json", help="path of tag description")
    args = parser.parse_args()
    dirPath = args.path

    if args.kind == "korean":
        rawTrain, rawDev, rawTest = readKoreanDataAll()
    elif args.kind == "english":
        rawTrain, rawDev, rawTest = readEnglishDataAll()
    else:
        print("unknown language:", args.kind, file=sys.stderr)
        sys.exit(1)

    converter = TagIdConverter(args.tag)
    os.makedirs(dirPath, exist_ok=True)

    PRETRAINED_MODEL_NAME = 'bert-base-multilingual-cased'
    print("load tokenizer...", file=sys.stderr)
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

    print("process train...", file=sys.stderr)
    train = preprocessing(tokenizer, converter, rawTrain)
    saveObject(os.path.join(dirPath, "train.json"), train)
    print("process dev...", file=sys.stderr)
    dev = preprocessing(tokenizer, converter, rawDev)
    saveObject(os.path.join(dirPath, "dev.json"), dev)
    print("process test...", file=sys.stderr)
    test = preprocessing(tokenizer, converter, rawTest)
    saveObject(os.path.join(dirPath, "test.json"), test)
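
# Example invocation (a sketch: the filename "preprocess.py" is assumed, and
# tags.json plus the raw corpora read by read_data must already be in place):
#
#   python preprocess.py prepro --kind korean --tag tags.json
#
# This writes prepro/train.json, prepro/dev.json, and prepro/test.json, which
# can later be reloaded with readPreporcssedDataAll("prepro").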