feat(read_data): add english data
This commit is contained in:
		
							parent
							
								
									bb1e0b5c64
								
							
						
					
					
						commit
						883f39d645
					
				
					 1 changed files with 44 additions and 31 deletions
				
			
		
							
								
								
									
										73
									
								
								read_data.py
									
										
									
									
									
								
							
							
						
						
									
										73
									
								
								read_data.py
									
										
									
									
									
								
							|  | @ -1,5 +1,6 @@ | ||||||
| import enum | import enum | ||||||
| from typing import NamedTuple, List, Sequence, TypeVar | from io import TextIOWrapper | ||||||
|  | from typing import Iterable, NamedTuple, List, Sequence, TypeVar | ||||||
| import json | import json | ||||||
| 
 | 
 | ||||||
| KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS" | KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS" | ||||||
|  | @ -25,29 +26,20 @@ class Sentence(NamedTuple): | ||||||
|         self.namedEntity.append(namedEntity) |         self.namedEntity.append(namedEntity) | ||||||
|         self.detail.append(detail) |         self.detail.append(detail) | ||||||
| T = TypeVar('T') | T = TypeVar('T') | ||||||
def readDataList(lst: Iterable[str], sep="\t"):
    """Group text lines into records separated by blank lines.

    Each non-blank line is stripped and split on *sep*; a blank line
    terminates the current record, which is yielded as a list of field
    lists.  Consecutive blank lines yield empty records (preserving the
    previous behaviour).

    Args:
        lst: iterable of raw text lines (e.g. an open file object).
        sep: field separator within one line (tab for the Korean corpus,
             space for the CoNLL-style English corpus).

    Yields:
        List[List[str]]: the split fields of each line in one record.
    """
    ret: List[List[str]] = []  # was annotated List[str]; entries are field lists
    for raw in lst:
        line = raw.strip()
        if line == "":
            yield ret
            # Rebind instead of ret.clear(): clearing would empty the list
            # object that was just handed to the consumer (aliasing bug,
            # visible e.g. via list(readDataList(...))).
            ret = []
        else:
            ret.append(line.split(sep))
    # Flush a trailing record when the input does not end with a blank
    # line; the old code silently dropped it.
    if ret:
        yield ret
| 
 | 
 | ||||||
|     for lines in readDataList(data): | def readKoreanData(fp: TextIOWrapper) -> List[Sentence]: | ||||||
|  |     ret = [] | ||||||
|  |     # NOTE(monoid): Do not use csv reader. | ||||||
|  |     for lines in readDataList(fp): | ||||||
|         sentence = Sentence([],[],[],[]) |         sentence = Sentence([],[],[],[]) | ||||||
|         for line in lines: |         for line in lines: | ||||||
|             word_pos:str = line[0] |             word_pos:str = line[0] | ||||||
|  | @ -55,18 +47,43 @@ def readKoreanData(path: str) -> List[Sentence]: | ||||||
|             sentence.append(words[0],line[1],line[2],line[3]) |             sentence.append(words[0],line[1],line[2],line[3]) | ||||||
|         ret.append(sentence) |         ret.append(sentence) | ||||||
|      |      | ||||||
|  |     fp.close() | ||||||
|     return ret |     return ret | ||||||
| 
 | 
 | ||||||
def readEnglishData(fp: TextIOWrapper) -> List[Sentence]:
    """Parse space-separated English NER data from *fp* into Sentences.

    Records are blank-line separated; single-field ``-DOCSTART-`` records
    (document boundary markers) are skipped.
    """
    sentences: List[Sentence] = []
    for record in readDataList(fp, sep=" "):
        # Document-boundary marker lines carry no token data.
        if len(record) == 1 and record[0][0] == "-DOCSTART-":
            continue
        current = Sentence([], [], [], [])
        for fields in record:
            current.append(fields[0], fields[1], fields[2], fields[3])
        sentences.append(current)
    return sentences
| 
 | 
 | ||||||
def readKoreanDataAll():
    """
        Each entry is structured as follows:
            POS, 
        Return: train, dev, test tuple
    """
    loaded = {}
    # Keep the original read order: dev, then test, then train.
    for split in ("dev", "test", "train"):
        with open(f"{KoreanBase}/{split}.txt", encoding="utf-8") as fp:
            loaded[split] = readKoreanData(fp)
    return loaded["train"], loaded["dev"], loaded["test"]
|  | 
 | ||||||
def readEnglishDataAll():
    """Load the English corpus splits (dev split is named ``valid.txt``).

    Return: train, dev, test tuple, mirroring readKoreanDataAll.
    """
    # Leftover debug prints ("a"/"b") around the dev load were removed.
    with open(f"{EnglishBase}/valid.txt", encoding="utf-8") as fp:
        dev = readEnglishData(fp)
    with open(f"{EnglishBase}/test.txt", encoding="utf-8") as fp:
        test = readEnglishData(fp)
    with open(f"{EnglishBase}/train.txt", encoding="utf-8") as fp:
        train = readEnglishData(fp)
    return train, dev, test
| 
 | 
 | ||||||
| class TagIdConverter: | class TagIdConverter: | ||||||
|  | @ -151,8 +168,6 @@ def make_long_namedEntity(a,b,c): | ||||||
|             break |             break | ||||||
|     return ret |     return ret | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| """ | """ | ||||||
| extracts and stores tags set from the given data.  | extracts and stores tags set from the given data.  | ||||||
| """ | """ | ||||||
|  | @ -160,7 +175,7 @@ if __name__ == "__main__": | ||||||
|     from tqdm import tqdm |     from tqdm import tqdm | ||||||
|     t = TagIdConverter() |     t = TagIdConverter() | ||||||
|      |      | ||||||
|     train, dev, test = readKoreanDataAll() |     train, dev, test = readEnglishDataAll() | ||||||
|     vocab = set() |     vocab = set() | ||||||
|     def getTags(lst: List[Sentence]): |     def getTags(lst: List[Sentence]): | ||||||
|         for s in tqdm(lst): |         for s in tqdm(lst): | ||||||
|  | @ -173,7 +188,6 @@ if __name__ == "__main__": | ||||||
|     print("get tags from test...") |     print("get tags from test...") | ||||||
|     getTags(test) |     getTags(test) | ||||||
|     print(vocab) |     print(vocab) | ||||||
| 
 |  | ||||||
|     for v in vocab: |     for v in vocab: | ||||||
|         if v == "O": |         if v == "O": | ||||||
|             continue |             continue | ||||||
|  | @ -183,7 +197,6 @@ if __name__ == "__main__": | ||||||
|         if not v in vocab: |         if not v in vocab: | ||||||
|             print("could not found pair " ,v) |             print("could not found pair " ,v) | ||||||
|             vocab.add(v) |             vocab.add(v) | ||||||
| 
 |  | ||||||
|     tags = [{"name":"[PAD]","index":0}] |     tags = [{"name":"[PAD]","index":0}] | ||||||
|     i = 1 |     i = 1 | ||||||
|     vocab_list = [*vocab] |     vocab_list = [*vocab] | ||||||
|  | @ -192,5 +205,5 @@ if __name__ == "__main__": | ||||||
|         tags.append({"name":v,"index":i}) |         tags.append({"name":v,"index":i}) | ||||||
|         i += 1 |         i += 1 | ||||||
|     print(tags) |     print(tags) | ||||||
|     with open("tags.json","w",encoding="utf-8") as fp: |     #with open("tags.json","w",encoding="utf-8") as fp: | ||||||
|         json.dump(tags,fp,ensure_ascii=False, indent=2) |     #    json.dump(tags,fp,ensure_ascii=False, indent=2) | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue