# ner-study/read_data.py
import enum
from typing import NamedTuple, List, Sequence, TypeVar
import json
KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
EnglishBase="[Ko, En] NER, POStag data/영문 NER, POS"
class Sentence(NamedTuple):
word: List[str]
pos: List[str]
"""
POS
"""
namedEntity: List[str]
"""
Named Entity
"""
detail: List[str]
"""
Named Entity Detail
"""
def append(self,word,pos,namedEntity,detail):
self.word.append(word)
self.pos.append(pos)
self.namedEntity.append(namedEntity)
self.detail.append(detail)
T = TypeVar('T')
def readDataList(lst: List[T]):
ret = []
for l in lst:
if len(l) > 0:
ret.append(l)
else:
yield ret
ret.clear()
def readKoreanData(path: str) -> List[Sentence]:
fp = open(path,encoding="utf-8")
data = []
for line in fp.readlines():
line = line.strip()
if line == "":
data.append([])
else:
data.append(line.split("\t"))
fp.close()
# Do not use csv reader.
ret = []
for lines in readDataList(data):
sentence = Sentence([],[],[],[])
for line in lines:
word_pos:str = line[0]
words = word_pos.split("/")
sentence.append(words[0],line[1],line[2],line[3])
ret.append(sentence)
return ret
def readKoreanDataAll():
"""
@return train, dev, test tuple
Each entry is structured as follows:
POS,
"""
dev = readKoreanData(f"{KoreanBase}/dev.txt")
test = readKoreanData(f"{KoreanBase}/test.txt")
train = readKoreanData(f"{KoreanBase}/train.txt")
return train, dev, test
class TagIdConverter:
def __init__(self, dict_path = "tags.json") -> None:
with open(dict_path,"r+",encoding="utf-8") as fp:
data = json.load(fp)
self.vocab = {}
for item in data:
self.vocab[item["name"]] = item["index"]
self.ids_to_token = {}
for item in data:
self.ids_to_token[item["index"]] = item["name"]
@property
def O_id(self):
return self.vocab["O"]
@property
def pad_id(self):
return self.vocab["[PAD]"]
def convert_ids_to_tokens(self,ids: List[int]):
return [self.ids_to_token[id] for id in ids]
def convert_tokens_to_ids(self, tokens: List[str]):
return [self.vocab[tt] for tt in tokens]
class MatchState(enum.IntEnum):
MATCH = 0
BEGIN = 1
INTER = 2
def match_indexes(a,b) -> Sequence[MatchState]:
s = iter(b)
v = ""
try:
v = next(s)
except StopIteration:
return
for k in a:
try:
if k == v:
yield MatchState.MATCH, k, v
v = next(s)
else:
yield MatchState.BEGIN, k, v
cum = v
while True:
v: str = next(s)
cum += v.strip("#")
yield MatchState.INTER, k, v
if k == cum:
v = next(s)
break
except StopIteration:
break
def make_long_namedEntity(a,b,c):
it = iter(c)
ret = []
entityType = ""
o = False
for s,_,_ in match_indexes(a,b):
try:
if s == MatchState.MATCH:
v = next(it)
ret.append(v)
elif s == MatchState.BEGIN:
v = next(it)
ret.append(v)
if v == "O":
o = True
else:
vv = v.split("-")
entityType = vv[1]
o = False
elif s == MatchState.INTER:
if o:
ret.append("O")
else:
ret.append(f"I-{entityType}")
except StopIteration:
break
return ret
"""
extracts and stores tags set from the given data.
"""
if __name__ == "__main__":
from tqdm import tqdm
t = TagIdConverter()
train, dev, test = readKoreanDataAll()
vocab = set()
def getTags(lst: List[Sentence]):
for s in tqdm(lst):
for e in s.detail:
2022-02-14 22:23:35 +09:00
vocab.add(e)
2022-02-13 17:34:03 +09:00
print("get tags from train...")
getTags(train)
print("get tags from dev...")
getTags(dev)
print("get tags from test...")
getTags(test)
print(vocab)
for v in vocab:
if v == "O":
continue
s = v.split("-")
s[0] = {"B":"I","I":"B"}[(s[0])]
v:str = "-".join(s)
if not v in vocab:
print("could not found pair " ,v)
2022-02-14 22:23:35 +09:00
vocab.add(v)
2022-02-13 17:34:03 +09:00
tags = [{"name":"[PAD]","index":0}]
i = 1
vocab_list = [*vocab]
vocab_list.sort()
for v in vocab_list:
tags.append({"name":v,"index":i})
i += 1
print(tags)
with open("tags.json","w",encoding="utf-8") as fp:
json.dump(tags,fp,ensure_ascii=False, indent=2)