diff --git a/Dataset.ipynb b/Dataset.ipynb new file mode 100644 index 0000000..3715414 --- /dev/null +++ b/Dataset.ipynb @@ -0,0 +1,315 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2b263b84", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'C:\\\\Users\\\\Monoid\\\\anaconda3\\\\envs\\\\nn\\\\python.exe'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import sys\n", + "sys.executable" + ] + }, + { + "cell_type": "markdown", + "id": "610a9887", + "metadata": {}, + "source": [ + "개발 환경 확인" + ] + }, + { + "cell_type": "markdown", + "id": "5d2b9307", + "metadata": {}, + "source": [ + "먼저 데이터를 다운로드해야 한다. 다운로드 스크립트(`download.py`)가 있으니 실행하면 된다." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5203952d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Extracting: 0%| | 0/14740 [00:00 10:\n", + " break" + ] + }, + { + "cell_type": "markdown", + "id": "245e9b05", + "metadata": {}, + "source": [ + "잘 읽히는 것을 볼 수가 있다." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a9efd391", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import HTML" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "36b7c38e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "from io import TextIOWrapper\n", + "from typing import List, Union\n", + "import os\n", + "import csv\n", + "from dataclasses import dataclass\n", + "import tqdm\n", + "@dataclass\n", + "class NsmcRawData:\n", + " id: int\n", + " document: str\n", + " label: int\n", + "\n", + "class NsmcRawDataReader:\n", + " def __init__(self, file: Union[str, TextIOWrapper]):\n", + " self.fp = file\n", + " self.need_close = isinstance(file,str)\n", + " if self.need_close:\n", + " self.fp = open(file,\"r\",encoding=\"utf-8\",newline='\\n')\n", + " self.rd = csv.DictReader(self.fp,delimiter='\\t')\n", + "\n", + " def __iter__(self):\n", + " mapper = lambda data: NsmcRawData(int(data[\"id\"]),data[\"document\"],int(data[\"label\"]))\n", + " return iter(map(mapper,self.rd))\n", + " \n", + " def close(self):\n", + " if self.need_close:\n", + " self.fp.close()\n", + " \n", + " def __enter__(self):\n", + " return self\n", + " def __exit__(self, exc_type, exc_val, exc_tb):\n", + " self.close()\n", + "\n", + "def readNsmcRawData(file: Union[str, TextIOWrapper], use_tqdm = False, total: int = 0) -> List[NsmcRawData]:\n", + " dataset = []\n", + " with NsmcRawDataReader(file) as dataReader:\n", + " if use_tqdm and total > 0:\n", + " for d in tqdm.tqdm(dataReader, total=total):\n", + " dataset.append(d)\n", + " else:\n", + " for data in dataReader:\n", + " dataset.append(data)\n", + " return dataset\n", + "\n", + "BASE_PATH = \"nsmc/nsmc-master\"\n", + "\n", + "if __name__ == \"__main__\":\n", + " dataset = []\n", + " raw = readNsmcRawData(f\"{BASE_PATH}/ratings.txt\", use_tqdm= True, total = 200000)" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "import html\n", + "\n", + "# Show ndata.py as a verbatim listing: escape the text so characters such as\n", + "# '<' and '&' are not parsed as markup, and wrap it in <pre> so line breaks\n", + "# and indentation survive HTML rendering.\n", + "# The file is read inside a with-block (so the handle is closed) and with an\n", + "# explicit encoding (so the result does not depend on the OS locale).\n", + "with open('ndata.py', 'r', encoding='utf-8') as source_file:\n", + "    ndata_source = source_file.read()\n", + "HTML('<pre>' + html.escape(ndata_source) + '</pre>')" + ] + }, + { + "cell_type": "markdown", + "id": "feddd692", + "metadata": {}, + "source": [ + "그래서 다음과 같이 코드를 짰다." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "251c75dd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}