diff --git a/preprocessing.py b/preprocessing.py
index 19c52ab..9e35fc9 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -1,5 +1,6 @@
+import sys
 from read_data import TagIdConverter, make_long_namedEntity, readKoreanDataAll, Sentence
 from typing import Any, NamedTuple, List, Sequence, TypeVar
 import json
@@ -45,16 +46,16 @@ if __name__ == "__main__":
     PRETAINED_MODEL_NAME = 'bert-base-multilingual-cased'

     rawTrain, rawDev, rawTest = readKoreanDataAll()
-    print("load tokenzier...")
+    print("load tokenizer...", file=sys.stderr)
     tokenizer = BertTokenizer.from_pretrained(PRETAINED_MODEL_NAME)
     converter = TagIdConverter()

-    print("process train...")
+    print("process train...", file=sys.stderr)
     train = preprocessing(tokenizer,converter,rawTrain)
     saveObject(path.join(PRE_BASE_PATH,"train.json"),train)
-    print("process dev...")
+    print("process dev...", file=sys.stderr)
     dev = preprocessing(tokenizer,converter,rawDev)
     saveObject(path.join(PRE_BASE_PATH,"dev.json"),dev)
-    print("process test...")
+    print("process test...", file=sys.stderr)
     test = preprocessing(tokenizer,converter,rawTest)
     saveObject(path.join(PRE_BASE_PATH,"test.json"),test)
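
Note (not part of the patch): the change presumably routes progress messages to sys.stderr so that stdout stays free for data output or stays quiet when redirected. A minimal sketch of that effect, with hypothetical messages and data:

import sys

# Progress/log messages go to stderr, separate from the data stream on stdout.
print("process train...", file=sys.stderr)   # still visible when stdout is redirected
print('{"example": "data"}')                  # hypothetical data output on stdout

# Example invocations (assumed usage, not from the patch):
#   python preprocessing.py > out.json        # progress still shows on the terminal
#   python preprocessing.py 2> progress.log   # progress captured separately from output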