feat: support English

This commit is contained in:
monoid 2022-02-22 18:33:29 +09:00
parent 54e757c247
commit 28ddd289b7
4 changed files with 1825476 additions and 12 deletions

306118
engpre/dev.json Normal file

File diff suppressed because it is too large Load Diff

287552
engpre/test.json Normal file

File diff suppressed because it is too large Load Diff

1231776
engpre/train.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,10 +1,11 @@
import argparse
import os
import sys import sys
from read_data import TagIdConverter, make_long_namedEntity, readKoreanDataAll, Sentence from read_data import TagIdConverter, make_long_namedEntity, readEnglishDataAll, readKoreanDataAll, Sentence
from typing import Any, NamedTuple, List, Sequence, TypeVar from typing import Any, List
import json import json
import os.path as path
import tqdm import tqdm
from transformers import PreTrainedTokenizer from transformers import PreTrainedTokenizer
@ -35,27 +36,44 @@ def readPreprocessedData(path: str):
with open(path,"r", encoding="utf-8") as fp: with open(path,"r", encoding="utf-8") as fp:
return json.load(fp) return json.load(fp)
def readPreporcssedDataAll(path = PRE_BASE_PATH):
    """
    Load the preprocessed train, dev and test splits from one directory.

    path: directory containing "train.json", "dev.json" and "test.json".
        Defaults to PRE_BASE_PATH.

    Returns a (train, dev, test) tuple; each element is the value that
    readPreprocessedData returns for the corresponding file.
    """
    # The parameter is called `path`, so file names are joined through
    # `os.path` explicitly rather than through a module alias of that name.
    train = readPreprocessedData(os.path.join(path, "train.json"))
    dev = readPreprocessedData(os.path.join(path, "dev.json"))
    test = readPreprocessedData(os.path.join(path, "test.json"))
    return train, dev, test
if __name__ == "__main__":
    # Command-line entry point: read the raw corpus for the selected
    # language, preprocess every split with the multilingual BERT tokenizer
    # and write train/dev/test JSON files into the output directory.
    parser = argparse.ArgumentParser()
    parser.add_argument("--kind", default="korean")
    # nargs="?" makes the positional optional. Without it argparse ignores
    # `default` and always requires the argument on the command line.
    parser.add_argument("path", nargs="?", default=PRE_BASE_PATH,
                        help="directory path of processed data")
    parser.add_argument("--tag", default="tags.json", help="path of tag description")
    args = parser.parse_args()
    dirPath = args.path

    # Pick the raw reader up front so an unknown language fails before the
    # tokenizer is loaded.
    if args.kind == "korean":
        rawTrain, rawDev, rawTest = readKoreanDataAll()
    elif args.kind == "english":
        rawTrain, rawDev, rawTest = readEnglishDataAll()
    else:
        print("unknown language", file=sys.stderr)
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist (e.g. under `python -S`).
        sys.exit(1)

    converter = TagIdConverter(args.tag)
    # exist_ok=True lets the script be re-run against an existing directory
    # instead of raising FileExistsError.
    os.makedirs(dirPath, exist_ok=True)

    from transformers import BertTokenizer
    PRETAINED_MODEL_NAME = 'bert-base-multilingual-cased'
    print("load tokenzier...", file=sys.stderr)
    tokenizer = BertTokenizer.from_pretrained(PRETAINED_MODEL_NAME)

    # Paths are joined with os.path.join: the `path` alias for os.path is no
    # longer imported, so `path.join` would raise NameError after the first
    # split had already been processed.
    print("process train...", file=sys.stderr)
    train = preprocessing(tokenizer, converter, rawTrain)
    saveObject(os.path.join(dirPath, "train.json"), train)

    print("process dev...", file=sys.stderr)
    dev = preprocessing(tokenizer, converter, rawDev)
    saveObject(os.path.join(dirPath, "dev.json"), dev)

    print("process test...", file=sys.stderr)
    test = preprocessing(tokenizer, converter, rawTest)
    saveObject(os.path.join(dirPath, "test.json"), test)