{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "2b263b84", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'C:\\\\Users\\\\Monoid\\\\anaconda3\\\\envs\\\\nn\\\\python.exe'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import sys\n", "sys.executable" ] }, { "cell_type": "markdown", "id": "610a9887", "metadata": {}, "source": [ "개발 환경 확인" ] }, { "cell_type": "markdown", "id": "5d2b9307", "metadata": {}, "source": [ "먼저 데이터를 다운로드해야 한다. 다운로드 스크립트(`download.py`)가 있으니 실행하면 된다." ] }, { "cell_type": "code", "execution_count": 11, "id": "5203952d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", "Extracting: 0%| | 0/14740 [00:00 10:\n", " break" ] }, { "cell_type": "markdown", "id": "245e9b05", "metadata": {}, "source": [ "잘 읽히는 것을 볼 수가 있다." ] }, { "cell_type": "code", "execution_count": 7, "id": "a9efd391", "metadata": {}, "outputs": [], "source": [ "from IPython.display import HTML" ] }, { "cell_type": "code", "execution_count": 10, "id": "36b7c38e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "from io import TextIOWrapper\n", "from typing import List, Union\n", "import os\n", "import csv\n", "from dataclasses import dataclass\n", "import tqdm\n", "@dataclass\n", "class NsmcRawData:\n", " id: int\n", " document: str\n", " label: int\n", "\n", "class NsmcRawDataReader:\n", " def __init__(self, file: Union[str, TextIOWrapper]):\n", " self.fp = file\n", " self.need_close = isinstance(file,str)\n", " if self.need_close:\n", " self.fp = open(file,\"r\",encoding=\"utf-8\",newline='\\n')\n", " self.rd = csv.DictReader(self.fp,delimiter='\\t')\n", "\n", " def __iter__(self):\n", " mapper = lambda data: NsmcRawData(int(data[\"id\"]),data[\"document\"],int(data[\"label\"]))\n", " return iter(map(mapper,self.rd))\n", " \n", " def close(self):\n", " if self.need_close:\n", " self.fp.close()\n", " \n", " def __enter__(self):\n", " return self\n", " def 
__exit__(self, exc_type, exc_val, exc_tb):\n", " self.close()\n", "\n", "def readNsmcRawData(file: Union[str, TextIOWrapper], use_tqdm = False, total: int = 0) -> List[NsmcRawData]:\n", " dataset = []\n", " with NsmcRawDataReader(file) as dataReader:\n", " if use_tqdm and total > 0:\n", " for d in tqdm.tqdm(dataReader, total=total):\n", " dataset.append(d)\n", " else:\n", " for data in dataReader:\n", " dataset.append(data)\n", " return dataset\n", "\n", "BASE_PATH = \"nsmc/nsmc-master\"\n", "\n", "if __name__ == \"__main__\":\n", " dataset = []\n", " raw = readNsmcRawData(f\"{BASE_PATH}/ratings.txt\", use_tqdm= True, total = 200000)" ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HTML(''+open('ndata.py',\"r\").read()+\"\")" ] }, { "cell_type": "markdown", "id": "feddd692", "metadata": {}, "source": [ "그래서 다음과 같이 코드를 짰다." ] }, { "cell_type": "code", "execution_count": null, "id": "251c75dd", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.11" } }, "nbformat": 4, "nbformat_minor": 5 }