From 883f39d6454d6505f8347cc50ed52a339d87f830 Mon Sep 17 00:00:00 2001 From: monoid Date: Tue, 22 Feb 2022 16:33:07 +0900 Subject: [PATCH] feat(read_data): add english data --- read_data.py | 75 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/read_data.py b/read_data.py index 1cd04d9..5be3817 100644 --- a/read_data.py +++ b/read_data.py @@ -1,5 +1,6 @@ import enum -from typing import NamedTuple, List, Sequence, TypeVar +from io import TextIOWrapper +from typing import Iterable, NamedTuple, List, Sequence, TypeVar import json KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS" @@ -25,48 +26,64 @@ class Sentence(NamedTuple): self.namedEntity.append(namedEntity) self.detail.append(detail) T = TypeVar('T') -def readDataList(lst: List[T]): - ret = [] +def readDataList(lst: Iterable[str], sep="\t"): + ret:List[str] = [] for l in lst: - if len(l) > 0: - ret.append(l) - else: + l = l.strip() + if l == "": yield ret ret.clear() - -def readKoreanData(path: str) -> List[Sentence]: - fp = open(path,encoding="utf-8") - data = [] - for line in fp.readlines(): - line = line.strip() - if line == "": - data.append([]) else: - data.append(line.split("\t")) - fp.close() - # Do not use csv reader. - ret = [] + ret.append(l.split(sep)) - for lines in readDataList(data): +def readKoreanData(fp: TextIOWrapper) -> List[Sentence]: + ret = [] + # NOTE(monoid): Do not use csv reader. + for lines in readDataList(fp): sentence = Sentence([],[],[],[]) for line in lines: word_pos:str = line[0] words = word_pos.split("/") sentence.append(words[0],line[1],line[2],line[3]) ret.append(sentence) - + + fp.close() return ret +def readEnglishData(fp: TextIOWrapper) -> List[Sentence]: + ret = [] + for lines in readDataList(fp,sep=" "): + if len(lines) == 1 and lines[0][0] == "-DOCSTART-": + continue + sentence = Sentence([],[],[],[]) + for line in lines: + sentence.append(line[0],line[1],line[2],line[3]) + ret.append(sentence) + return ret def readKoreanDataAll(): """ - @return train, dev, test tuple Each entry is structured as follows: POS, + Return: train, dev, test tuple """ - dev = readKoreanData(f"{KoreanBase}/dev.txt") - test = readKoreanData(f"{KoreanBase}/test.txt") - train = readKoreanData(f"{KoreanBase}/train.txt") + with open(f"{KoreanBase}/dev.txt", encoding="utf-8") as fp: + dev = readKoreanData(fp) + with open(f"{KoreanBase}/test.txt", encoding="utf-8") as fp: + test = readKoreanData(fp) + with open(f"{KoreanBase}/train.txt", encoding="utf-8") as fp: + train = readKoreanData(fp) + return train, dev, test + +def readEnglishDataAll(): + with open(f"{EnglishBase}/valid.txt", encoding="utf-8") as fp: + print("a") + dev = readEnglishData(fp) + print("b") + with open(f"{EnglishBase}/test.txt", encoding="utf-8") as fp: + test = readEnglishData(fp) + with open(f"{EnglishBase}/train.txt", encoding="utf-8") as fp: + train = readEnglishData(fp) return train, dev, test class TagIdConverter: @@ -151,8 +168,6 @@ def make_long_namedEntity(a,b,c): break return ret - - """ extracts and stores tags set from the given data. """ @@ -160,7 +175,7 @@ if __name__ == "__main__": from tqdm import tqdm t = TagIdConverter() - train, dev, test = readKoreanDataAll() + train, dev, test = readEnglishDataAll() vocab = set() def getTags(lst: List[Sentence]): for s in tqdm(lst): @@ -173,7 +188,6 @@ if __name__ == "__main__": print("get tags from test...") getTags(test) print(vocab) - for v in vocab: if v == "O": continue @@ -183,7 +197,6 @@ if __name__ == "__main__": if not v in vocab: print("could not found pair " ,v) vocab.add(v) - tags = [{"name":"[PAD]","index":0}] i = 1 vocab_list = [*vocab] @@ -192,5 +205,5 @@ if __name__ == "__main__": tags.append({"name":v,"index":i}) i += 1 print(tags) - with open("tags.json","w",encoding="utf-8") as fp: - json.dump(tags,fp,ensure_ascii=False, indent=2) \ No newline at end of file + #with open("tags.json","w",encoding="utf-8") as fp: + # json.dump(tags,fp,ensure_ascii=False, indent=2) \ No newline at end of file