commit 99a88c6da54517435c8c813a3b07e8ff89958eda
Author: monoid
Date:   Wed Feb 23 17:48:39 2022 +0900

    init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e93d762
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+nsmc/**/*
+nsmc.zip
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..44f6d61
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# nsmc bert
+
diff --git a/download.py b/download.py
new file mode 100644
--- /dev/null
+++ b/download.py
@@ -0,0 +1,48 @@
+import requests
+import tqdm
+import zipfile
+import os
+
+
+def downloadNsmc(filename="nsmc.zip", timeout=30):
+    """Download the NSMC master-branch archive from GitHub into *filename*.
+
+    The archive is streamed in 8 KiB chunks with a tqdm progress bar. Data
+    is written to ``<filename>.part`` first and renamed on success, so an
+    interrupted download never leaves a truncated zip at *filename*.
+
+    Raises requests.HTTPError if the server answers with an error status.
+    """
+    URL = "https://github.com/e9t/nsmc/archive/refs/heads/master.zip"
+    partName = filename + ".part"
+
+    # Ask for an unencoded body so Content-Length (when sent) matches the
+    # number of bytes yielded by iter_content.
+    with requests.get(URL, stream=True, timeout=timeout,
+                      headers={"Accept-Encoding": "identity"}) as nsmc:
+        nsmc.raise_for_status()
+        # Size comes from the final (post-redirect) response; the server may
+        # omit it, in which case tqdm shows a running count with no total.
+        length = nsmc.headers.get("Content-Length")
+        total = int(length) if length is not None else None
+        try:
+            with open(partName, "wb") as fp:
+                with tqdm.tqdm(total=total, unit='byte', desc=filename) as t:
+                    for chunk in nsmc.iter_content(chunk_size=8192):
+                        fp.write(chunk)
+                        t.update(len(chunk))
+        except BaseException:
+            # Drop the partial file, then let the error propagate.
+            if os.path.exists(partName):
+                os.remove(partName)
+            raise
+    os.replace(partName, filename)
+
+
+if __name__ == "__main__":
+    # exist_ok lets the script be re-run when the directory already exists.
+    os.makedirs("nsmc", exist_ok=True)
+    if not os.path.exists("nsmc.zip"):
+        downloadNsmc()
+    with zipfile.ZipFile("nsmc.zip") as nsmc:
+        nsmc.extractall("nsmc")
diff --git a/ndata.py b/ndata.py
new file mode 100644
index 0000000..5be1fb2
--- /dev/null
+++ b/ndata.py
@@ -0,0 +1,4 @@
+import os
+
+BASE_PATH = "nsmc/nsmc-master"
+