"""Preprocess the Korean NER dataset: tokenize each sentence with a BERT
tokenizer, align the named-entity tags to the wordpiece tokens, and dump
the result as JSON under PRE_BASE_PATH."""
import json
import os.path as path
import sys
from typing import Any, List

import tqdm
from transformers import PreTrainedTokenizer

from read_data import TagIdConverter, make_long_namedEntity, readKoreanDataAll, Sentence

PRE_BASE_PATH = 'prepro'


def preprocessing(tokenizer: PreTrainedTokenizer, converter: TagIdConverter,
                  dataset: List[Sentence]):
    ret = []
    for item in tqdm.tqdm(dataset):
        assert len(item.word) == len(item.detail)
        # Tokenize the whitespace-joined words into wordpiece tokens.
        tokens = tokenizer.tokenize(" ".join(item.word))
        # Stretch the per-word entity tags so each wordpiece gets a tag.
        e = make_long_namedEntity(item.word, tokens, item.detail)
        if len(e) != len(tokens):
            # Dump the mismatched pair before failing, to ease debugging.
            print(e, tokens)
        assert len(e) == len(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        entityIds = converter.convert_tokens_to_ids(e)
        ret.append({"tokens": tokens, "ids": ids,
                    "entity": e, "entity_ids": entityIds})
    return ret


def saveObject(file_path: str, data: Any):
    # `file_path` (rather than `path`) avoids shadowing the os.path import.
    with open(file_path, "w", encoding="utf-8") as fp:
        json.dump(data, fp, ensure_ascii=False, indent=2)


def readPreprocessedData(file_path: str):
    with open(file_path, "r", encoding="utf-8") as fp:
        return json.load(fp)


def readPreprocessedDataAll():
    train = readPreprocessedData(path.join(PRE_BASE_PATH, "train.json"))
    dev = readPreprocessedData(path.join(PRE_BASE_PATH, "dev.json"))
    test = readPreprocessedData(path.join(PRE_BASE_PATH, "test.json"))
    return train, dev, test


# Backward-compatible alias for the original (misspelled) name.
readPreporcssedDataAll = readPreprocessedDataAll


if __name__ == "__main__":
    from transformers import BertTokenizer

    PRETRAINED_MODEL_NAME = 'bert-base-multilingual-cased'
    rawTrain, rawDev, rawTest = readKoreanDataAll()
    print("load tokenizer...", file=sys.stderr)
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    converter = TagIdConverter()
    print("process train...", file=sys.stderr)
    train = preprocessing(tokenizer, converter, rawTrain)
    saveObject(path.join(PRE_BASE_PATH, "train.json"), train)
    print("process dev...", file=sys.stderr)
    dev = preprocessing(tokenizer, converter, rawDev)
    saveObject(path.join(PRE_BASE_PATH, "dev.json"), dev)
    print("process test...", file=sys.stderr)
    test = preprocessing(tokenizer, converter, rawTest)
    saveObject(path.join(PRE_BASE_PATH, "test.json"), test)
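

# --- Illustrative sketch, not part of the original pipeline ---
# One way the records written above could be turned into fixed-length model
# inputs. The field names ("ids", "entity_ids") match what preprocessing()
# emits; pad_id / pad_label_id / max_length are assumptions for this example
# and should be taken from the actual tokenizer and tag set.
def pad_record(record: dict, pad_id: int, pad_label_id: int, max_length: int = 128):
    """Pad (or truncate) one preprocessed record to a fixed length."""
    ids = record["ids"][:max_length]
    labels = record["entity_ids"][:max_length]
    attention = [1] * len(ids)          # 1 for real tokens...
    pad_len = max_length - len(ids)
    ids += [pad_id] * pad_len           # ...then pad the ids,
    labels += [pad_label_id] * pad_len  # the labels,
    attention += [0] * pad_len          # and the mask alike.
    return {"input_ids": ids, "attention_mask": attention, "labels": labels}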