61 lines
2.1 KiB
Python
61 lines
2.1 KiB
Python
|
|
|
|
from read_data import TagIdConverter, make_long_namedEntity, readKoreanDataAll, Sentence
|
|
from typing import Any, NamedTuple, List, Sequence, TypeVar
|
|
import json
|
|
import os.path as path
|
|
import tqdm
|
|
from transformers import PreTrainedTokenizer
|
|
|
|
# Directory holding the preprocessed train/dev/test JSON files.
PREPROCESSING_BASE_PATH = 'prepro'
|
|
# Module-level converter, built once at import time; preprocessing() calls its
# convert_tokens_to_ids() on the per-token entity labels.
converter = TagIdConverter()
|
|
|
|
def preprocessing(tokenizer: PreTrainedTokenizer, dataset: List[Sentence]):
    """Tokenize every sentence and pair its tokens with entity labels.

    For each item in ``dataset`` the words are joined with single spaces and
    tokenized, ``make_long_namedEntity`` is asked for the labels that go with
    those tokens, and both sequences are converted to ids.

    Args:
        tokenizer: Tokenizer providing ``tokenize`` and
            ``convert_tokens_to_ids``.
        dataset: Items exposing parallel ``word`` and ``detail`` sequences.

    Returns:
        A list with one dict per item, holding the keys ``"tokens"``,
        ``"ids"``, ``"entity"`` and ``"entity_ids"``.

    Raises:
        AssertionError: If ``word`` and ``detail`` differ in length, or if
            the number of labels does not match the number of tokens.
    """
    records = []
    for sentence in tqdm.tqdm(dataset):
        words, details = sentence.word, sentence.detail
        assert len(words) == len(details)

        tokens = tokenizer.tokenize(" ".join(words))
        labels = make_long_namedEntity(words, tokens, details)

        token_count = len(tokens)
        label_count = len(labels)
        if label_count != token_count:
            # Show the offending pair before the assertion below fires.
            print(labels, tokens)
        assert label_count == token_count

        records.append(
            {
                "tokens": tokens,
                "ids": tokenizer.convert_tokens_to_ids(tokens),
                "entity": labels,
                "entity_ids": converter.convert_tokens_to_ids(labels),
            }
        )
    return records
|
|
|
|
def saveObject(path: str, data: Any):
    """Serialize ``data`` as indented UTF-8 JSON at ``path``.

    Missing parent directories are created first, so saving into an output
    directory that does not exist yet no longer fails with
    ``FileNotFoundError``.

    Args:
        path: Destination file path; an existing file is overwritten.
        data: Any object the ``json`` module can serialize.

    Raises:
        TypeError: If ``data`` contains values ``json`` cannot serialize.
        OSError: If the directory or the file cannot be created or written.
    """
    # The ``path`` parameter shadows the module-level ``os.path`` alias, so
    # ``os`` is imported locally for the directory handling.
    import os

    parent = os.path.dirname(path)
    if parent:  # a bare file name has no directory component to create
        os.makedirs(parent, exist_ok=True)

    with open(path, "w", encoding="utf-8") as fp:
        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
        json.dump(data, fp, ensure_ascii=False, indent=2)
|
|
|
|
def readPreprocessedData(path: str):
    """Load and return the JSON document stored at ``path``.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value.
    """
    with open(path, mode="r", encoding="utf-8") as source:
        document = json.load(source)
    return document
|
|
|
|
def readPreporcssedDataAll():
    """Load the train, dev and test JSON files from the preprocessing directory.

    Returns:
        A ``(train, dev, test)`` tuple with the decoded contents of
        ``train.json``, ``dev.json`` and ``test.json`` found under
        ``PREPROCESSING_BASE_PATH``, read in that order.
    """
    filenames = ("train.json", "dev.json", "test.json")
    train, dev, test = (
        readPreprocessedData(path.join(PREPROCESSING_BASE_PATH, filename))
        for filename in filenames
    )
    return train, dev, test
|
|
|
|
if __name__ == "__main__":
    # Script entry point: tokenize every raw split and store the result as
    # JSON under PREPROCESSING_BASE_PATH.
    from transformers import BertTokenizer

    PRETAINED_MODEL_NAME = 'bert-base-multilingual-cased'

    rawTrain, rawDev, rawTest = readKoreanDataAll()
    print("load tokenzier...")
    tokenizer = BertTokenizer.from_pretrained(PRETAINED_MODEL_NAME)

    # (progress message, raw sentences, output file name), handled in order.
    jobs = (
        ("process train...", rawTrain, "train.json"),
        ("process dev...", rawDev, "dev.json"),
        ("process test...", rawTest, "test.json"),
    )
    for message, raw_split, filename in jobs:
        print(message)
        processed = preprocessing(tokenizer, raw_split)
        saveObject(path.join(PREPROCESSING_BASE_PATH, filename), processed)
|