From 883f39d6454d6505f8347cc50ed52a339d87f830 Mon Sep 17 00:00:00 2001
From: monoid <qaswedfr55@gmail.com>
Date: Tue, 22 Feb 2022 16:33:07 +0900
Subject: [PATCH] feat(read_data): add english data

---
 read_data.py | 75 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 44 insertions(+), 31 deletions(-)

diff --git a/read_data.py b/read_data.py
index 1cd04d9..5be3817 100644
--- a/read_data.py
+++ b/read_data.py
@@ -1,5 +1,6 @@
 import enum
-from typing import NamedTuple, List, Sequence, TypeVar
+from io import TextIOWrapper
+from typing import Iterable, NamedTuple, List, Sequence, TypeVar
 import json
 
 KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
@@ -25,48 +26,64 @@ class Sentence(NamedTuple):
         self.namedEntity.append(namedEntity)
         self.detail.append(detail)
 T = TypeVar('T')
-def readDataList(lst: List[T]):
-    ret = []
+def readDataList(lst: Iterable[str], sep="\t"):
+    ret:List[str] = []
     for l in lst:
-        if len(l) > 0:
-            ret.append(l)
-        else:
+        l = l.strip()
+        if l == "":
             yield ret
             ret.clear()
-
-def readKoreanData(path: str) -> List[Sentence]:
-    fp = open(path,encoding="utf-8")
-    data = []
-    for line in fp.readlines():
-        line = line.strip()
-        if line == "":
-            data.append([])
         else:
-            data.append(line.split("\t"))
-    fp.close()
-    # Do not use csv reader.
-    ret = []
+            ret.append(l.split(sep))
 
-    for lines in readDataList(data):
+def readKoreanData(fp: TextIOWrapper) -> List[Sentence]:
+    ret = []
+    # NOTE(monoid): Do not use csv reader.
+    for lines in readDataList(fp):
         sentence = Sentence([],[],[],[])
         for line in lines:
             word_pos:str = line[0]
             words = word_pos.split("/")
             sentence.append(words[0],line[1],line[2],line[3])
         ret.append(sentence)
-
+    
+    fp.close()
     return ret
 
+def readEnglishData(fp: TextIOWrapper) -> List[Sentence]:
+    ret = []
+    for lines in readDataList(fp,sep=" "):
+        if len(lines) == 1 and lines[0][0] == "-DOCSTART-":
+            continue
+        sentence = Sentence([],[],[],[])
+        for line in lines:
+            sentence.append(line[0],line[1],line[2],line[3])
+        ret.append(sentence)
+    return ret
 
 def readKoreanDataAll():
     """
-        @return train, dev, test tuple
         Each entry is structured as follows:
             POS, 
+        Returns: a (train, dev, test) tuple of sentence lists.
     """
-    dev = readKoreanData(f"{KoreanBase}/dev.txt")
-    test = readKoreanData(f"{KoreanBase}/test.txt")
-    train = readKoreanData(f"{KoreanBase}/train.txt")
+    with open(f"{KoreanBase}/dev.txt", encoding="utf-8") as fp:
+        dev = readKoreanData(fp)
+    with open(f"{KoreanBase}/test.txt", encoding="utf-8") as fp:
+        test = readKoreanData(fp)
+    with open(f"{KoreanBase}/train.txt", encoding="utf-8") as fp:
+        train = readKoreanData(fp)
+    return train, dev, test
+
+def readEnglishDataAll():
+    with open(f"{EnglishBase}/valid.txt", encoding="utf-8") as fp:
+        print("a")
+        dev = readEnglishData(fp)
+        print("b")
+    with open(f"{EnglishBase}/test.txt", encoding="utf-8") as fp:
+        test = readEnglishData(fp)
+    with open(f"{EnglishBase}/train.txt", encoding="utf-8") as fp:
+        train = readEnglishData(fp)
     return train, dev, test
 
 class TagIdConverter:
@@ -151,8 +168,6 @@ def make_long_namedEntity(a,b,c):
             break
     return ret
 
-
-
 """
 extracts and stores tags set from the given data. 
 """
@@ -160,7 +175,7 @@ if __name__ == "__main__":
     from tqdm import tqdm
     t = TagIdConverter()
     
-    train, dev, test = readKoreanDataAll()
+    train, dev, test = readEnglishDataAll()
     vocab = set()
     def getTags(lst: List[Sentence]):
         for s in tqdm(lst):
@@ -173,7 +188,6 @@ if __name__ == "__main__":
     print("get tags from test...")
     getTags(test)
     print(vocab)
-
     for v in vocab:
         if v == "O":
             continue
@@ -183,7 +197,6 @@ if __name__ == "__main__":
         if not v in vocab:
             print("could not found pair " ,v)
             vocab.add(v)
-
     tags = [{"name":"[PAD]","index":0}]
     i = 1
     vocab_list = [*vocab]
@@ -192,5 +205,5 @@ if __name__ == "__main__":
         tags.append({"name":v,"index":i})
         i += 1
     print(tags)
-    with open("tags.json","w",encoding="utf-8") as fp:
-        json.dump(tags,fp,ensure_ascii=False, indent=2)
\ No newline at end of file
+    #with open("tags.json","w",encoding="utf-8") as fp:
+    #    json.dump(tags,fp,ensure_ascii=False, indent=2)
\ No newline at end of file