feat(read_data): add english data

monoid 2022-02-22 16:33:07 +09:00
parent bb1e0b5c64
commit 883f39d645
1 changed file with 44 additions and 31 deletions


@@ -1,5 +1,6 @@
 import enum
-from typing import NamedTuple, List, Sequence, TypeVar
+from io import TextIOWrapper
+from typing import Iterable, NamedTuple, List, Sequence, TypeVar
 import json
 
 KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
@@ -25,48 +26,64 @@ class Sentence(NamedTuple):
         self.namedEntity.append(namedEntity)
         self.detail.append(detail)
 
 T = TypeVar('T')
-def readDataList(lst: List[T]):
-    ret = []
+def readDataList(lst: Iterable[str], sep="\t"):
+    ret:List[str] = []
     for l in lst:
-        if len(l) > 0:
-            ret.append(l)
-        else:
+        l = l.strip()
+        if l == "":
             yield ret
             ret.clear()
+        else:
+            ret.append(l.split(sep))
 
-def readKoreanData(path: str) -> List[Sentence]:
-    fp = open(path,encoding="utf-8")
-    data = []
-    for line in fp.readlines():
-        line = line.strip()
-        if line == "":
-            data.append([])
-        else:
-            data.append(line.split("\t"))
-    fp.close()
-    # Do not use csv reader.
-    ret = []
-    for lines in readDataList(data):
+def readKoreanData(fp: TextIOWrapper) -> List[Sentence]:
+    ret = []
+    # NOTE(monoid): Do not use csv reader.
+    for lines in readDataList(fp):
         sentence = Sentence([],[],[],[])
         for line in lines:
             word_pos:str = line[0]
             words = word_pos.split("/")
             sentence.append(words[0],line[1],line[2],line[3])
         ret.append(sentence)
     fp.close()
     return ret
+
+def readEnglishData(fp: TextIOWrapper) -> List[Sentence]:
+    ret = []
+    for lines in readDataList(fp,sep=" "):
+        if len(lines) == 1 and lines[0][0] == "-DOCSTART-":
+            continue
+        sentence = Sentence([],[],[],[])
+        for line in lines:
+            sentence.append(line[0],line[1],line[2],line[3])
+        ret.append(sentence)
+    return ret
 
 def readKoreanDataAll():
     """
-    @return train, dev, test tuple
-    Each entry is structured as follows:
-    POS,
+    Return: train, dev, test tuple
     """
-    dev = readKoreanData(f"{KoreanBase}/dev.txt")
-    test = readKoreanData(f"{KoreanBase}/test.txt")
-    train = readKoreanData(f"{KoreanBase}/train.txt")
+    with open(f"{KoreanBase}/dev.txt", encoding="utf-8") as fp:
+        dev = readKoreanData(fp)
+    with open(f"{KoreanBase}/test.txt", encoding="utf-8") as fp:
+        test = readKoreanData(fp)
+    with open(f"{KoreanBase}/train.txt", encoding="utf-8") as fp:
+        train = readKoreanData(fp)
     return train, dev, test
 
+def readEnglishDataAll():
+    with open(f"{EnglishBase}/valid.txt", encoding="utf-8") as fp:
+        dev = readEnglishData(fp)
+    with open(f"{EnglishBase}/test.txt", encoding="utf-8") as fp:
+        test = readEnglishData(fp)
+    with open(f"{EnglishBase}/train.txt", encoding="utf-8") as fp:
+        train = readEnglishData(fp)
+    return train, dev, test
+
 class TagIdConverter:
@@ -151,8 +168,6 @@ def make_long_namedEntity(a,b,c):
             break
     return ret
 
 """
 extracts and stores tags set from the given data.
 """
@@ -160,7 +175,7 @@ if __name__ == "__main__":
     from tqdm import tqdm
     t = TagIdConverter()
-    train, dev, test = readKoreanDataAll()
+    train, dev, test = readEnglishDataAll()
 
     vocab = set()
     def getTags(lst: List[Sentence]):
         for s in tqdm(lst):
@@ -173,7 +188,6 @@ if __name__ == "__main__":
     print("get tags from test...")
     getTags(test)
-    print(vocab)
     for v in vocab:
         if v == "O":
             continue
@@ -183,7 +197,6 @@ if __name__ == "__main__":
         if not v in vocab:
             print("could not found pair " ,v)
-            vocab.add(v)
     tags = [{"name":"[PAD]","index":0}]
     i = 1
     vocab_list = [*vocab]
@@ -192,5 +205,5 @@
         tags.append({"name":v,"index":i})
         i += 1
     print(tags)
-    with open("tags.json","w",encoding="utf-8") as fp:
-        json.dump(tags,fp,ensure_ascii=False, indent=2)
+    #with open("tags.json","w",encoding="utf-8") as fp:
+    #    json.dump(tags,fp,ensure_ascii=False, indent=2)
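
For reference, a minimal usage sketch of the English reader added in this commit, run against an in-memory CoNLL-2003-style snippet (space-separated token/POS/chunk/NER columns with a -DOCSTART- header, the layout readEnglishData expects). The read_data module name and the sample rows are assumptions for illustration, not part of the commit:

from io import StringIO

from read_data import readEnglishData  # assumed module name, inferred from the commit scope

# Two sentences plus a -DOCSTART- block, each terminated by a blank line.
# readDataList(fp, sep=" ") only yields a block when it reaches an empty line,
# so the trailing blank line is needed for the last sentence to be emitted.
sample = StringIO(
    "-DOCSTART- -X- -X- O\n"
    "\n"
    "EU NNP B-NP B-ORG\n"
    "rejects VBZ B-VP O\n"
    "\n"
    "Peter NNP B-NP B-PER\n"
    "\n"
)

for sentence in readEnglishData(sample):
    print(sentence)  # one Sentence per block; the -DOCSTART- block is skipped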