feat: support english
This commit is contained in:
parent
54e757c247
commit
28ddd289b7
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,10 +1,11 @@
|
|||
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from read_data import TagIdConverter, make_long_namedEntity, readKoreanDataAll, Sentence
|
||||
from typing import Any, NamedTuple, List, Sequence, TypeVar
|
||||
from read_data import TagIdConverter, make_long_namedEntity, readEnglishDataAll, readKoreanDataAll, Sentence
|
||||
from typing import Any, List
|
||||
import json
|
||||
import os.path as path
|
||||
import tqdm
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
|
@ -35,27 +36,44 @@ def readPreprocessedData(path: str):
|
|||
with open(path,"r", encoding="utf-8") as fp:
|
||||
return json.load(fp)
|
||||
|
||||
def readPreporcssedDataAll():
|
||||
train = readPreprocessedData(path.join(PRE_BASE_PATH,"train.json"))
|
||||
dev = readPreprocessedData(path.join(PRE_BASE_PATH,"dev.json"))
|
||||
test = readPreprocessedData(path.join(PRE_BASE_PATH,"test.json"))
|
||||
def readPreporcssedDataAll(path = PRE_BASE_PATH):
|
||||
train = readPreprocessedData(os.path.join(path,"train.json"))
|
||||
dev = readPreprocessedData(os.path.join(path,"dev.json"))
|
||||
test = readPreprocessedData(os.path.join(path,"test.json"))
|
||||
return train, dev, test
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--kind", default="korean")
|
||||
parser.add_argument("path",default=PRE_BASE_PATH,help="directory path of processed data")
|
||||
parser.add_argument("--tag", default="tags.json",help="path of tag description")
|
||||
|
||||
args = parser.parse_args()
|
||||
dirPath = args.path
|
||||
|
||||
if args.kind == "korean":
|
||||
rawTrain, rawDev, rawTest = readKoreanDataAll()
|
||||
elif args.kind == "english":
|
||||
rawTrain, rawDev, rawTest = readEnglishDataAll()
|
||||
else:
|
||||
print("unknown language",file=sys.stderr)
|
||||
exit(1)
|
||||
|
||||
converter = TagIdConverter(args.tag)
|
||||
os.makedirs(dirPath)
|
||||
|
||||
from transformers import BertTokenizer
|
||||
PRETAINED_MODEL_NAME = 'bert-base-multilingual-cased'
|
||||
|
||||
rawTrain, rawDev, rawTest = readKoreanDataAll()
|
||||
print("load tokenzier...",file=sys.stderr)
|
||||
tokenizer = BertTokenizer.from_pretrained(PRETAINED_MODEL_NAME)
|
||||
converter = TagIdConverter()
|
||||
|
||||
print("process train...",file=sys.stderr)
|
||||
train = preprocessing(tokenizer,converter,rawTrain)
|
||||
saveObject(path.join(PRE_BASE_PATH,"train.json"),train)
|
||||
saveObject(path.join(dirPath,"train.json"),train)
|
||||
print("process dev...",file=sys.stderr)
|
||||
dev = preprocessing(tokenizer,converter,rawDev)
|
||||
saveObject(path.join(PRE_BASE_PATH,"dev.json"),dev)
|
||||
saveObject(path.join(dirPath,"dev.json"),dev)
|
||||
print("process test...",file=sys.stderr)
|
||||
test = preprocessing(tokenizer,converter,rawTest)
|
||||
saveObject(path.join(PRE_BASE_PATH,"test.json"),test)
|
||||
saveObject(path.join(dirPath,"test.json"),test)
|
||||
|
|
Loading…
Reference in New Issue