feat: support english
This commit is contained in:
parent
54e757c247
commit
28ddd289b7
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,10 +1,11 @@
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
from read_data import TagIdConverter, make_long_namedEntity, readKoreanDataAll, Sentence
|
from read_data import TagIdConverter, make_long_namedEntity, readEnglishDataAll, readKoreanDataAll, Sentence
|
||||||
from typing import Any, NamedTuple, List, Sequence, TypeVar
|
from typing import Any, List
|
||||||
import json
|
import json
|
||||||
import os.path as path
|
|
||||||
import tqdm
|
import tqdm
|
||||||
from transformers import PreTrainedTokenizer
|
from transformers import PreTrainedTokenizer
|
||||||
|
|
||||||
|
@ -35,27 +36,44 @@ def readPreprocessedData(path: str):
|
||||||
with open(path,"r", encoding="utf-8") as fp:
|
with open(path,"r", encoding="utf-8") as fp:
|
||||||
return json.load(fp)
|
return json.load(fp)
|
||||||
|
|
||||||
def readPreporcssedDataAll(path: str = PRE_BASE_PATH):
    """Load the preprocessed train/dev/test splits from a directory.

    Args:
        path: Directory that contains ``train.json``, ``dev.json`` and
            ``test.json``. Defaults to ``PRE_BASE_PATH``, so existing
            zero-argument callers keep working.

    Returns:
        A ``(train, dev, test)`` tuple; each element is whatever
        ``readPreprocessedData`` returns for the corresponding file.
    """
    # The parameter is named ``path``, so the join must go through ``os.path``
    # explicitly rather than through a module-level ``path`` alias.
    train = readPreprocessedData(os.path.join(path, "train.json"))
    dev = readPreprocessedData(os.path.join(path, "dev.json"))
    test = readPreprocessedData(os.path.join(path, "test.json"))
    return train, dev, test
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Command-line entry point: read the raw corpus for the requested
    # language, run ``preprocessing`` over each split and write the results
    # as JSON files into the output directory.
    parser = argparse.ArgumentParser()
    parser.add_argument("--kind", default="korean")
    # nargs="?" makes the positional optional; without it argparse always
    # requires the argument and the default would never be used.
    parser.add_argument("path", nargs="?", default=PRE_BASE_PATH,
                        help="directory path of processed data")
    parser.add_argument("--tag", default="tags.json", help="path of tag description")

    args = parser.parse_args()
    dirPath = args.path

    if args.kind == "korean":
        rawTrain, rawDev, rawTest = readKoreanDataAll()
    elif args.kind == "english":
        rawTrain, rawDev, rawTest = readEnglishDataAll()
    else:
        print("unknown language", file=sys.stderr)
        # sys.exit is always available; the bare ``exit`` builtin is only
        # injected by the ``site`` module.
        sys.exit(1)

    converter = TagIdConverter(args.tag)
    # exist_ok=True so that re-running into an existing directory does not
    # raise FileExistsError.
    os.makedirs(dirPath, exist_ok=True)

    from transformers import BertTokenizer
    PRETAINED_MODEL_NAME = 'bert-base-multilingual-cased'

    print("load tokenzier...", file=sys.stderr)
    tokenizer = BertTokenizer.from_pretrained(PRETAINED_MODEL_NAME)

    # Same work for every split: announce, preprocess, save as <split>.json.
    # os.path.join is used because the ``path`` alias for os.path is no
    # longer imported at module level.
    for splitName, rawSplit in (("train", rawTrain), ("dev", rawDev), ("test", rawTest)):
        print(f"process {splitName}...", file=sys.stderr)
        processed = preprocessing(tokenizer, converter, rawSplit)
        saveObject(os.path.join(dirPath, f"{splitName}.json"), processed)
|
||||||
|
|
Loading…
Reference in New Issue