commit 312021484e275849e87469607098db234173e214 Author: monoid Date: Mon Nov 28 20:34:02 2022 +0900 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..20499b8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +results/ +results_hidden/ +test_mlp/ + +22ML_hw3.zip +hw3/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class diff --git a/defaultAccuracyGraph.png b/defaultAccuracyGraph.png new file mode 100644 index 0000000..9f6b46a Binary files /dev/null and b/defaultAccuracyGraph.png differ diff --git a/defaultLossGraph.png b/defaultLossGraph.png new file mode 100644 index 0000000..4068294 Binary files /dev/null and b/defaultLossGraph.png differ diff --git a/experiments.json b/experiments.json new file mode 100644 index 0000000..087bb86 --- /dev/null +++ b/experiments.json @@ -0,0 +1,530 @@ +{ + "remain_experiments": [], + "completed_experiment_results": [ + { + "lr": 0.1, + "momentum": 0.0, + "mini_batch_size": 2, + "save_dir": "results/lr=0.1_momentum=0.0_batch_size=2", + "time": 28.41178321838379 + }, + { + "lr": 0.1, + "momentum": 0.0, + "mini_batch_size": 14, + "save_dir": "results/lr=0.1_momentum=0.0_batch_size=14", + "time": 51.77998423576355 + }, + { + "lr": 0.1, + "momentum": 0.0, + "mini_batch_size": 65, + "save_dir": "results/lr=0.1_momentum=0.0_batch_size=65", + "time": 37.20900106430054 + }, + { + "lr": 0.1, + "momentum": 0.0, + "mini_batch_size": 241, + "save_dir": "results/lr=0.1_momentum=0.0_batch_size=241", + "time": 18.352060556411743 + }, + { + "lr": 0.1, + "momentum": 0.0, + "mini_batch_size": 844, + "save_dir": "results/lr=0.1_momentum=0.0_batch_size=844", + "time": 13.6323721408844 + }, + { + "lr": 0.1, + "momentum": 0.5, + "mini_batch_size": 2, + "save_dir": "results/lr=0.1_momentum=0.5_batch_size=2", + "time": 36.67720055580139 + }, + { + "lr": 0.1, + "momentum": 0.5, + "mini_batch_size": 14, + "save_dir": "results/lr=0.1_momentum=0.5_batch_size=14", + "time": 30.415109634399414 + }, + { + 
"lr": 0.1, + "momentum": 0.5, + "mini_batch_size": 65, + "save_dir": "results/lr=0.1_momentum=0.5_batch_size=65", + "time": 10.964645147323608 + }, + { + "lr": 0.1, + "momentum": 0.5, + "mini_batch_size": 241, + "save_dir": "results/lr=0.1_momentum=0.5_batch_size=241", + "time": 8.89481782913208 + }, + { + "lr": 0.1, + "momentum": 0.5, + "mini_batch_size": 844, + "save_dir": "results/lr=0.1_momentum=0.5_batch_size=844", + "time": 8.2913076877594 + }, + { + "lr": 0.1, + "momentum": 0.9, + "mini_batch_size": 2, + "save_dir": "results/lr=0.1_momentum=0.9_batch_size=2", + "time": 62.4284188747406 + }, + { + "lr": 0.1, + "momentum": 0.9, + "mini_batch_size": 14, + "save_dir": "results/lr=0.1_momentum=0.9_batch_size=14", + "time": 26.73504114151001 + }, + { + "lr": 0.1, + "momentum": 0.9, + "mini_batch_size": 65, + "save_dir": "results/lr=0.1_momentum=0.9_batch_size=65", + "time": 13.481291055679321 + }, + { + "lr": 0.1, + "momentum": 0.9, + "mini_batch_size": 241, + "save_dir": "results/lr=0.1_momentum=0.9_batch_size=241", + "time": 5.815146207809448 + }, + { + "lr": 0.1, + "momentum": 0.9, + "mini_batch_size": 844, + "save_dir": "results/lr=0.1_momentum=0.9_batch_size=844", + "time": 5.680160999298096 + }, + { + "lr": 0.05, + "momentum": 0.0, + "mini_batch_size": 2, + "save_dir": "results/lr=0.05_momentum=0.0_batch_size=2", + "time": 24.537229299545288 + }, + { + "lr": 0.05, + "momentum": 0.0, + "mini_batch_size": 14, + "save_dir": "results/lr=0.05_momentum=0.0_batch_size=14", + "time": 32.553603172302246 + }, + { + "lr": 0.05, + "momentum": 0.0, + "mini_batch_size": 65, + "save_dir": "results/lr=0.05_momentum=0.0_batch_size=65", + "time": 22.78046178817749 + }, + { + "lr": 0.05, + "momentum": 0.0, + "mini_batch_size": 241, + "save_dir": "results/lr=0.05_momentum=0.0_batch_size=241", + "time": 26.703359127044678 + }, + { + "lr": 0.05, + "momentum": 0.0, + "mini_batch_size": 844, + "save_dir": "results/lr=0.05_momentum=0.0_batch_size=844", + "time": 45.83435368537903 + 
}, + { + "lr": 0.05, + "momentum": 0.5, + "mini_batch_size": 2, + "save_dir": "results/lr=0.05_momentum=0.5_batch_size=2", + "time": 36.79327321052551 + }, + { + "lr": 0.05, + "momentum": 0.5, + "mini_batch_size": 14, + "save_dir": "results/lr=0.05_momentum=0.5_batch_size=14", + "time": 37.78914451599121 + }, + { + "lr": 0.05, + "momentum": 0.5, + "mini_batch_size": 65, + "save_dir": "results/lr=0.05_momentum=0.5_batch_size=65", + "time": 15.176773071289062 + }, + { + "lr": 0.05, + "momentum": 0.5, + "mini_batch_size": 241, + "save_dir": "results/lr=0.05_momentum=0.5_batch_size=241", + "time": 16.030482530593872 + }, + { + "lr": 0.05, + "momentum": 0.5, + "mini_batch_size": 844, + "save_dir": "results/lr=0.05_momentum=0.5_batch_size=844", + "time": 17.813740730285645 + }, + { + "lr": 0.05, + "momentum": 0.9, + "mini_batch_size": 2, + "save_dir": "results/lr=0.05_momentum=0.9_batch_size=2", + "time": 50.95525074005127 + }, + { + "lr": 0.05, + "momentum": 0.9, + "mini_batch_size": 14, + "save_dir": "results/lr=0.05_momentum=0.9_batch_size=14", + "time": 16.846729516983032 + }, + { + "lr": 0.05, + "momentum": 0.9, + "mini_batch_size": 65, + "save_dir": "results/lr=0.05_momentum=0.9_batch_size=65", + "time": 6.8179051876068115 + }, + { + "lr": 0.05, + "momentum": 0.9, + "mini_batch_size": 241, + "save_dir": "results/lr=0.05_momentum=0.9_batch_size=241", + "time": 10.041207075119019 + }, + { + "lr": 0.05, + "momentum": 0.9, + "mini_batch_size": 844, + "save_dir": "results/lr=0.05_momentum=0.9_batch_size=844", + "time": 14.046262979507446 + }, + { + "lr": 0.01, + "momentum": 0.0, + "mini_batch_size": 2, + "save_dir": "results/lr=0.01_momentum=0.0_batch_size=2", + "time": 47.5756618976593 + }, + { + "lr": 0.01, + "momentum": 0.0, + "mini_batch_size": 14, + "save_dir": "results/lr=0.01_momentum=0.0_batch_size=14", + "time": 38.926867723464966 + }, + { + "lr": 0.01, + "momentum": 0.0, + "mini_batch_size": 65, + "save_dir": "results/lr=0.01_momentum=0.0_batch_size=65", + 
"time": 43.19446611404419 + }, + { + "lr": 0.01, + "momentum": 0.0, + "mini_batch_size": 241, + "save_dir": "results/lr=0.01_momentum=0.0_batch_size=241", + "time": 66.58777904510498 + }, + { + "lr": 0.01, + "momentum": 0.0, + "mini_batch_size": 844, + "save_dir": "results/lr=0.01_momentum=0.0_batch_size=844", + "time": 74.81579160690308 + }, + { + "lr": 0.01, + "momentum": 0.5, + "mini_batch_size": 2, + "save_dir": "results/lr=0.01_momentum=0.5_batch_size=2", + "time": 54.7407808303833 + }, + { + "lr": 0.01, + "momentum": 0.5, + "mini_batch_size": 14, + "save_dir": "results/lr=0.01_momentum=0.5_batch_size=14", + "time": 25.720898389816284 + }, + { + "lr": 0.01, + "momentum": 0.5, + "mini_batch_size": 65, + "save_dir": "results/lr=0.01_momentum=0.5_batch_size=65", + "time": 22.998470306396484 + }, + { + "lr": 0.01, + "momentum": 0.5, + "mini_batch_size": 241, + "save_dir": "results/lr=0.01_momentum=0.5_batch_size=241", + "time": 39.0310754776001 + }, + { + "lr": 0.01, + "momentum": 0.5, + "mini_batch_size": 844, + "save_dir": "results/lr=0.01_momentum=0.5_batch_size=844", + "time": 82.8304545879364 + }, + { + "lr": 0.01, + "momentum": 0.9, + "mini_batch_size": 2, + "save_dir": "results/lr=0.01_momentum=0.9_batch_size=2", + "time": 28.122719287872314 + }, + { + "lr": 0.01, + "momentum": 0.9, + "mini_batch_size": 14, + "save_dir": "results/lr=0.01_momentum=0.9_batch_size=14", + "time": 45.34721541404724 + }, + { + "lr": 0.01, + "momentum": 0.9, + "mini_batch_size": 65, + "save_dir": "results/lr=0.01_momentum=0.9_batch_size=65", + "time": 14.570504665374756 + }, + { + "lr": 0.01, + "momentum": 0.9, + "mini_batch_size": 241, + "save_dir": "results/lr=0.01_momentum=0.9_batch_size=241", + "time": 13.945464611053467 + }, + { + "lr": 0.01, + "momentum": 0.9, + "mini_batch_size": 844, + "save_dir": "results/lr=0.01_momentum=0.9_batch_size=844", + "time": 26.580908060073853 + }, + { + "lr": 0.005, + "momentum": 0.0, + "mini_batch_size": 2, + "save_dir": 
"results/lr=0.005_momentum=0.0_batch_size=2", + "time": 37.00913667678833 + }, + { + "lr": 0.005, + "momentum": 0.0, + "mini_batch_size": 14, + "save_dir": "results/lr=0.005_momentum=0.0_batch_size=14", + "time": 48.5772442817688 + }, + { + "lr": 0.005, + "momentum": 0.0, + "mini_batch_size": 65, + "save_dir": "results/lr=0.005_momentum=0.0_batch_size=65", + "time": 68.84409880638123 + }, + { + "lr": 0.005, + "momentum": 0.0, + "mini_batch_size": 241, + "save_dir": "results/lr=0.005_momentum=0.0_batch_size=241", + "time": 78.7110869884491 + }, + { + "lr": 0.005, + "momentum": 0.0, + "mini_batch_size": 844, + "save_dir": "results/lr=0.005_momentum=0.0_batch_size=844", + "time": 76.47329998016357 + }, + { + "lr": 0.005, + "momentum": 0.5, + "mini_batch_size": 2, + "save_dir": "results/lr=0.005_momentum=0.5_batch_size=2", + "time": 49.36377739906311 + }, + { + "lr": 0.005, + "momentum": 0.5, + "mini_batch_size": 14, + "save_dir": "results/lr=0.005_momentum=0.5_batch_size=14", + "time": 36.931933641433716 + }, + { + "lr": 0.005, + "momentum": 0.5, + "mini_batch_size": 65, + "save_dir": "results/lr=0.005_momentum=0.5_batch_size=65", + "time": 31.2841374874115 + }, + { + "lr": 0.005, + "momentum": 0.5, + "mini_batch_size": 241, + "save_dir": "results/lr=0.005_momentum=0.5_batch_size=241", + "time": 69.75861930847168 + }, + { + "lr": 0.005, + "momentum": 0.5, + "mini_batch_size": 844, + "save_dir": "results/lr=0.005_momentum=0.5_batch_size=844", + "time": 75.49948239326477 + }, + { + "lr": 0.005, + "momentum": 0.9, + "mini_batch_size": 2, + "save_dir": "results/lr=0.005_momentum=0.9_batch_size=2", + "time": 35.81554460525513 + }, + { + "lr": 0.005, + "momentum": 0.9, + "mini_batch_size": 14, + "save_dir": "results/lr=0.005_momentum=0.9_batch_size=14", + "time": 19.310007095336914 + }, + { + "lr": 0.005, + "momentum": 0.9, + "mini_batch_size": 65, + "save_dir": "results/lr=0.005_momentum=0.9_batch_size=65", + "time": 18.249040842056274 + }, + { + "lr": 0.005, + "momentum": 
0.9, + "mini_batch_size": 241, + "save_dir": "results/lr=0.005_momentum=0.9_batch_size=241", + "time": 24.462805032730103 + }, + { + "lr": 0.005, + "momentum": 0.9, + "mini_batch_size": 844, + "save_dir": "results/lr=0.005_momentum=0.9_batch_size=844", + "time": 39.130101442337036 + }, + { + "lr": 0.001, + "momentum": 0.0, + "mini_batch_size": 2, + "save_dir": "results/lr=0.001_momentum=0.0_batch_size=2", + "time": 68.75053930282593 + }, + { + "lr": 0.001, + "momentum": 0.0, + "mini_batch_size": 14, + "save_dir": "results/lr=0.001_momentum=0.0_batch_size=14", + "time": 95.08529353141785 + }, + { + "lr": 0.001, + "momentum": 0.0, + "mini_batch_size": 65, + "save_dir": "results/lr=0.001_momentum=0.0_batch_size=65", + "time": 99.59149670600891 + }, + { + "lr": 0.001, + "momentum": 0.0, + "mini_batch_size": 241, + "save_dir": "results/lr=0.001_momentum=0.0_batch_size=241", + "time": 76.81180620193481 + }, + { + "lr": 0.001, + "momentum": 0.0, + "mini_batch_size": 844, + "save_dir": "results/lr=0.001_momentum=0.0_batch_size=844", + "time": 72.93008518218994 + }, + { + "lr": 0.001, + "momentum": 0.5, + "mini_batch_size": 2, + "save_dir": "results/lr=0.001_momentum=0.5_batch_size=2", + "time": 46.15296936035156 + }, + { + "lr": 0.001, + "momentum": 0.5, + "mini_batch_size": 14, + "save_dir": "results/lr=0.001_momentum=0.5_batch_size=14", + "time": 74.169504404068 + }, + { + "lr": 0.001, + "momentum": 0.5, + "mini_batch_size": 65, + "save_dir": "results/lr=0.001_momentum=0.5_batch_size=65", + "time": 88.55051469802856 + }, + { + "lr": 0.001, + "momentum": 0.5, + "mini_batch_size": 241, + "save_dir": "results/lr=0.001_momentum=0.5_batch_size=241", + "time": 77.15520548820496 + }, + { + "lr": 0.001, + "momentum": 0.5, + "mini_batch_size": 844, + "save_dir": "results/lr=0.001_momentum=0.5_batch_size=844", + "time": 74.80425786972046 + }, + { + "lr": 0.001, + "momentum": 0.9, + "mini_batch_size": 2, + "save_dir": "results/lr=0.001_momentum=0.9_batch_size=2", + "time": 
39.646984577178955 + }, + { + "lr": 0.001, + "momentum": 0.9, + "mini_batch_size": 14, + "save_dir": "results/lr=0.001_momentum=0.9_batch_size=14", + "time": 30.707565307617188 + }, + { + "lr": 0.001, + "momentum": 0.9, + "mini_batch_size": 65, + "save_dir": "results/lr=0.001_momentum=0.9_batch_size=65", + "time": 35.97918176651001 + }, + { + "lr": 0.001, + "momentum": 0.9, + "mini_batch_size": 241, + "save_dir": "results/lr=0.001_momentum=0.9_batch_size=241", + "time": 58.797680139541626 + }, + { + "lr": 0.001, + "momentum": 0.9, + "mini_batch_size": 844, + "save_dir": "results/lr=0.001_momentum=0.9_batch_size=844", + "time": 74.01161527633667 + } + ] +} \ No newline at end of file diff --git a/experiments_hidden.json b/experiments_hidden.json new file mode 100644 index 0000000..ce54408 --- /dev/null +++ b/experiments_hidden.json @@ -0,0 +1,691 @@ +{ + "remain_experiments": [], + "completed_experiment_results": [ + { + "num_hiddens": [ + 2, + 2 + ], + "save_dir": "results_hidden/2_2", + "test_acc": 0.5714285714285714, + "test_ce": 1.1883265018067433, + "train_acc": 0.5995850622406639, + "train_ce": 1.0621748825074904, + "valid_acc": 0.6014319809069213, + "valid_ce": 1.2018897073103385, + "time": 35.80903363227844 + }, + { + "num_hiddens": [ + 2, + 4 + ], + "save_dir": "results_hidden/2_4", + "test_acc": 0.3168831168831169, + "test_ce": 1.8386744068829908, + "train_acc": 0.2726733847065797, + "train_ce": 1.9393003115898362, + "valid_acc": 0.27923627684964203, + "valid_ce": 1.9272044325512694, + "time": 9.895411014556885 + }, + { + "num_hiddens": [ + 2, + 8 + ], + "save_dir": "results_hidden/2_8", + "test_acc": 0.5818181818181818, + "test_ce": 1.103191264144406, + "train_acc": 0.6381149970361588, + "train_ce": 0.9764487928491248, + "valid_acc": 0.6276849642004774, + "valid_ce": 1.1694202147502029, + "time": 17.842985153198242 + }, + { + "num_hiddens": [ + 2, + 16 + ], + "save_dir": "results_hidden/2_16", + "test_acc": 0.5324675324675324, + "test_ce": 
1.3175016715487053, + "train_acc": 0.534973325429757, + "train_ce": 1.2472188336243626, + "valid_acc": 0.5369928400954654, + "valid_ce": 1.3007100604671007, + "time": 26.737168788909912 + }, + { + "num_hiddens": [ + 2, + 32 + ], + "save_dir": "results_hidden/2_32", + "test_acc": 0.574025974025974, + "test_ce": 1.1028347076278828, + "train_acc": 0.6665678719620628, + "train_ce": 0.9337453653443895, + "valid_acc": 0.6324582338902148, + "valid_ce": 1.13878659615313, + "time": 25.09254479408264 + }, + { + "num_hiddens": [ + 2, + 64 + ], + "save_dir": "results_hidden/2_64", + "test_acc": 0.5246753246753246, + "test_ce": 1.344395789386107, + "train_acc": 0.495850622406639, + "train_ce": 1.380705929807397, + "valid_acc": 0.5202863961813843, + "valid_ce": 1.3925691134172038, + "time": 23.93467116355896 + }, + { + "num_hiddens": [ + 2, + 100 + ], + "save_dir": "results_hidden/2_100", + "test_acc": 0.5818181818181818, + "test_ce": 1.0873695787125757, + "train_acc": 0.6357439241256668, + "train_ce": 0.9777181114452983, + "valid_acc": 0.6205250596658711, + "valid_ce": 1.1207118389563573, + "time": 18.33916211128235 + }, + { + "num_hiddens": [ + 4, + 2 + ], + "save_dir": "results_hidden/4_2", + "test_acc": 0.561038961038961, + "test_ce": 1.0956761800519494, + "train_acc": 0.6576763485477178, + "train_ce": 0.9297563951467718, + "valid_acc": 0.6324582338902148, + "valid_ce": 1.1263616883078866, + "time": 18.195006370544434 + }, + { + "num_hiddens": [ + 4, + 4 + ], + "save_dir": "results_hidden/4_4", + "test_acc": 0.6779220779220779, + "test_ce": 0.9642773216966196, + "train_acc": 0.7868998221695317, + "train_ce": 0.6032810477588412, + "valid_acc": 0.7159904534606205, + "valid_ce": 0.8563414428923701, + "time": 30.676226139068604 + }, + { + "num_hiddens": [ + 4, + 8 + ], + "save_dir": "results_hidden/4_8", + "test_acc": 0.587012987012987, + "test_ce": 1.086920456137629, + "train_acc": 0.6232957913455839, + "train_ce": 1.0288384336153955, + "valid_acc": 0.6109785202863962, + 
"valid_ce": 1.1296178001326305, + "time": 18.145662307739258 + }, + { + "num_hiddens": [ + 4, + 16 + ], + "save_dir": "results_hidden/4_16", + "test_acc": 0.6857142857142857, + "test_ce": 0.8784320934740455, + "train_acc": 0.7442205097806758, + "train_ce": 0.6975511975824222, + "valid_acc": 0.6825775656324582, + "valid_ce": 0.8526744343887943, + "time": 14.786683082580566 + }, + { + "num_hiddens": [ + 4, + 32 + ], + "save_dir": "results_hidden/4_32", + "test_acc": 0.5714285714285714, + "test_ce": 1.1185037586350628, + "train_acc": 0.6529342027267339, + "train_ce": 0.9630479799049851, + "valid_acc": 0.6252983293556086, + "valid_ce": 1.1439494373524954, + "time": 16.73994541168213 + }, + { + "num_hiddens": [ + 4, + 64 + ], + "save_dir": "results_hidden/4_64", + "test_acc": 0.612987012987013, + "test_ce": 1.059105126470988, + "train_acc": 0.6395969176052163, + "train_ce": 0.9838368119847848, + "valid_acc": 0.6276849642004774, + "valid_ce": 1.1034921965916509, + "time": 38.15524506568909 + }, + { + "num_hiddens": [ + 4, + 100 + ], + "save_dir": "results_hidden/4_100", + "test_acc": 0.6909090909090909, + "test_ce": 0.8455726705883155, + "train_acc": 0.7596324836988737, + "train_ce": 0.6350005112948498, + "valid_acc": 0.711217183770883, + "valid_ce": 0.8639518636426027, + "time": 17.032158851623535 + }, + { + "num_hiddens": [ + 8, + 2 + ], + "save_dir": "results_hidden/8_2", + "test_acc": 0.5792207792207792, + "test_ce": 1.1296442078180708, + "train_acc": 0.6384113811499703, + "train_ce": 0.9889069528105606, + "valid_acc": 0.6252983293556086, + "valid_ce": 1.1035069248742386, + "time": 37.70230746269226 + }, + { + "num_hiddens": [ + 8, + 4 + ], + "save_dir": "results_hidden/8_4", + "test_acc": 0.6623376623376623, + "test_ce": 0.8595203270694369, + "train_acc": 0.7534084173088322, + "train_ce": 0.6962762445417804, + "valid_acc": 0.7016706443914081, + "valid_ce": 0.9502958443839282, + "time": 17.593876361846924 + }, + { + "num_hiddens": [ + 8, + 8 + ], + "save_dir": 
"results_hidden/8_8", + "test_acc": 0.7038961038961039, + "test_ce": 0.8337800754042419, + "train_acc": 0.7934202726733847, + "train_ce": 0.5848908502487367, + "valid_acc": 0.7231503579952268, + "valid_ce": 0.8072033771198709, + "time": 28.873700618743896 + }, + { + "num_hiddens": [ + 8, + 16 + ], + "save_dir": "results_hidden/8_16", + "test_acc": 0.7038961038961039, + "test_ce": 0.7809423193874597, + "train_acc": 0.7723770005927683, + "train_ce": 0.6250902749524869, + "valid_acc": 0.7279236276849642, + "valid_ce": 0.8152645208544564, + "time": 17.62161684036255 + }, + { + "num_hiddens": [ + 8, + 32 + ], + "save_dir": "results_hidden/8_32", + "test_acc": 0.7168831168831169, + "test_ce": 0.776042622977459, + "train_acc": 0.8103141671606402, + "train_ce": 0.5296839598330854, + "valid_acc": 0.7279236276849642, + "valid_ce": 0.7958847771876185, + "time": 17.263331651687622 + }, + { + "num_hiddens": [ + 8, + 64 + ], + "save_dir": "results_hidden/8_64", + "test_acc": 0.7376623376623377, + "test_ce": 0.6871967257220808, + "train_acc": 0.7957913455838767, + "train_ce": 0.5710957470706919, + "valid_acc": 0.7589498806682577, + "valid_ce": 0.7789058480339265, + "time": 17.318830251693726 + }, + { + "num_hiddens": [ + 8, + 100 + ], + "save_dir": "results_hidden/8_100", + "test_acc": 0.7532467532467533, + "test_ce": 0.6913332829019823, + "train_acc": 0.7943094250148192, + "train_ce": 0.5916968487617739, + "valid_acc": 0.7374701670644391, + "valid_ce": 0.8247806543568104, + "time": 16.315816402435303 + }, + { + "num_hiddens": [ + 16, + 2 + ], + "save_dir": "results_hidden/16_2", + "test_acc": 0.5844155844155844, + "test_ce": 1.1426318664361472, + "train_acc": 0.6108476585655009, + "train_ce": 1.0906332967029762, + "valid_acc": 0.6348448687350835, + "valid_ce": 1.1630470652574731, + "time": 28.151522159576416 + }, + { + "num_hiddens": [ + 16, + 4 + ], + "save_dir": "results_hidden/16_4", + "test_acc": 0.6103896103896104, + "test_ce": 0.9489044764667965, + "train_acc": 
0.7216953171310018, + "train_ce": 0.7404024623315697, + "valid_acc": 0.6706443914081146, + "valid_ce": 1.0537869377500546, + "time": 27.455986738204956 + }, + { + "num_hiddens": [ + 16, + 8 + ], + "save_dir": "results_hidden/16_8", + "test_acc": 0.7376623376623377, + "test_ce": 0.7222062023899021, + "train_acc": 0.8236514522821576, + "train_ce": 0.4787140712879479, + "valid_acc": 0.7446300715990454, + "valid_ce": 0.8500082468912674, + "time": 24.222497940063477 + }, + { + "num_hiddens": [ + 16, + 16 + ], + "save_dir": "results_hidden/16_16", + "test_acc": 0.7454545454545455, + "test_ce": 0.7115067249069256, + "train_acc": 0.8541790160047421, + "train_ce": 0.41610849773293757, + "valid_acc": 0.7422434367541766, + "valid_ce": 0.8717020956329505, + "time": 24.185862064361572 + }, + { + "num_hiddens": [ + 16, + 32 + ], + "save_dir": "results_hidden/16_32", + "test_acc": 0.7454545454545455, + "test_ce": 0.7181385504472038, + "train_acc": 0.8595139300533492, + "train_ce": 0.4068310295669791, + "valid_acc": 0.747016706443914, + "valid_ce": 0.7938633153977195, + "time": 19.79218626022339 + }, + { + "num_hiddens": [ + 16, + 64 + ], + "save_dir": "results_hidden/16_64", + "test_acc": 0.7558441558441559, + "test_ce": 0.6907142403940347, + "train_acc": 0.8189093064611737, + "train_ce": 0.5024369624312522, + "valid_acc": 0.7517899761336515, + "valid_ce": 0.7456447860188432, + "time": 20.831907987594604 + }, + { + "num_hiddens": [ + 16, + 100 + ], + "save_dir": "results_hidden/16_100", + "test_acc": 0.7298701298701299, + "test_ce": 0.7376651639713366, + "train_acc": 0.8340248962655602, + "train_ce": 0.4642076721743233, + "valid_acc": 0.7589498806682577, + "valid_ce": 0.788278985554659, + "time": 18.365669012069702 + }, + { + "num_hiddens": [ + 32, + 2 + ], + "save_dir": "results_hidden/32_2", + "test_acc": 0.5974025974025974, + "test_ce": 1.0542766468255889, + "train_acc": 0.6905749851807943, + "train_ce": 0.8379298643973804, + "valid_acc": 0.630071599045346, + "valid_ce": 
1.0981339361402433, + "time": 28.125181198120117 + }, + { + "num_hiddens": [ + 32, + 4 + ], + "save_dir": "results_hidden/32_4", + "test_acc": 0.6649350649350649, + "test_ce": 0.87830095162699, + "train_acc": 0.7750444576170717, + "train_ce": 0.6229575046534378, + "valid_acc": 0.6968973747016707, + "valid_ce": 0.9151727240340731, + "time": 30.2298641204834 + }, + { + "num_hiddens": [ + 32, + 8 + ], + "save_dir": "results_hidden/32_8", + "test_acc": 0.7116883116883117, + "test_ce": 0.7563488391898573, + "train_acc": 0.7854179016004742, + "train_ce": 0.5813457185835761, + "valid_acc": 0.7565632458233891, + "valid_ce": 0.8050935173229951, + "time": 20.859773874282837 + }, + { + "num_hiddens": [ + 32, + 16 + ], + "save_dir": "results_hidden/32_16", + "test_acc": 0.7610389610389611, + "test_ce": 0.7367815537177893, + "train_acc": 0.8524007113218731, + "train_ce": 0.41243020424125854, + "valid_acc": 0.7541766109785203, + "valid_ce": 0.7450861394889439, + "time": 23.25441861152649 + }, + { + "num_hiddens": [ + 32, + 32 + ], + "save_dir": "results_hidden/32_32", + "test_acc": 0.7454545454545455, + "test_ce": 0.699087546278245, + "train_acc": 0.8488441019561351, + "train_ce": 0.43960130569443095, + "valid_acc": 0.7565632458233891, + "valid_ce": 0.8179696371465912, + "time": 24.133497714996338 + }, + { + "num_hiddens": [ + 32, + 64 + ], + "save_dir": "results_hidden/32_64", + "test_acc": 0.7402597402597403, + "test_ce": 0.6816005950053298, + "train_acc": 0.8171310017783047, + "train_ce": 0.4941334483916563, + "valid_acc": 0.747016706443914, + "valid_ce": 0.765047061743974, + "time": 15.495444059371948 + }, + { + "num_hiddens": [ + 32, + 100 + ], + "save_dir": "results_hidden/32_100", + "test_acc": 0.7376623376623377, + "test_ce": 0.7380551652170162, + "train_acc": 0.8577356253704801, + "train_ce": 0.39339669650229053, + "valid_acc": 0.7804295942720764, + "valid_ce": 0.6918119270671622, + "time": 23.888253211975098 + }, + { + "num_hiddens": [ + 64, + 2 + ], + "save_dir": 
"results_hidden/64_2", + "test_acc": 0.3168831168831169, + "test_ce": 1.838809512757515, + "train_acc": 0.2714878482513337, + "train_ce": 1.9393289205876165, + "valid_acc": 0.27923627684964203, + "valid_ce": 1.9267377909552563, + "time": 11.168122053146362 + }, + { + "num_hiddens": [ + 64, + 4 + ], + "save_dir": "results_hidden/64_4", + "test_acc": 0.6649350649350649, + "test_ce": 0.8692490938521161, + "train_acc": 0.7581505631298162, + "train_ce": 0.6467321057973495, + "valid_acc": 0.7136038186157518, + "valid_ce": 0.8219261167566893, + "time": 20.189652681350708 + }, + { + "num_hiddens": [ + 64, + 8 + ], + "save_dir": "results_hidden/64_8", + "test_acc": 0.7558441558441559, + "test_ce": 0.7125661361785404, + "train_acc": 0.8630705394190872, + "train_ce": 0.386970779741758, + "valid_acc": 0.7541766109785203, + "valid_ce": 0.8449150052167953, + "time": 25.257745265960693 + }, + { + "num_hiddens": [ + 64, + 16 + ], + "save_dir": "results_hidden/64_16", + "test_acc": 0.7662337662337663, + "test_ce": 0.6944239927117467, + "train_acc": 0.8393598103141672, + "train_ce": 0.44708450292601815, + "valid_acc": 0.747016706443914, + "valid_ce": 0.9026907304169889, + "time": 24.140554904937744 + }, + { + "num_hiddens": [ + 64, + 32 + ], + "save_dir": "results_hidden/64_32", + "test_acc": 0.7480519480519481, + "test_ce": 0.6866846017003834, + "train_acc": 0.8553645524599881, + "train_ce": 0.39588284235318555, + "valid_acc": 0.7828162291169452, + "valid_ce": 0.7157990105825294, + "time": 32.83610534667969 + }, + { + "num_hiddens": [ + 64, + 64 + ], + "save_dir": "results_hidden/64_64", + "test_acc": 0.7428571428571429, + "test_ce": 0.6728173089077266, + "train_acc": 0.8550681683461766, + "train_ce": 0.40158346661747935, + "valid_acc": 0.7589498806682577, + "valid_ce": 0.7261777449667132, + "time": 23.787596225738525 + }, + { + "num_hiddens": [ + 64, + 100 + ], + "save_dir": "results_hidden/64_100", + "test_acc": 0.7454545454545455, + "test_ce": 0.7448447640037901, + "train_acc": 
0.8844101956135151, + "train_ce": 0.3197491476691474, + "valid_acc": 0.7780429594272077, + "valid_ce": 0.7349206173691523, + "time": 25.499298572540283 + }, + { + "num_hiddens": [ + 100, + 2 + ], + "save_dir": "results_hidden/100_2", + "test_acc": 0.5974025974025974, + "test_ce": 1.033157364751078, + "train_acc": 0.7471843509187908, + "train_ce": 0.7089747143706052, + "valid_acc": 0.6706443914081146, + "valid_ce": 1.0081316571677383, + "time": 41.23928952217102 + }, + { + "num_hiddens": [ + 100, + 4 + ], + "save_dir": "results_hidden/100_4", + "test_acc": 0.6883116883116883, + "test_ce": 0.8776306751616497, + "train_acc": 0.8473621813870776, + "train_ce": 0.42027758025381884, + "valid_acc": 0.7279236276849642, + "valid_ce": 0.9112785492043114, + "time": 38.92436861991882 + }, + { + "num_hiddens": [ + 100, + 8 + ], + "save_dir": "results_hidden/100_8", + "test_acc": 0.7272727272727273, + "test_ce": 0.7080898780464847, + "train_acc": 0.8672199170124482, + "train_ce": 0.3946154105562895, + "valid_acc": 0.7565632458233891, + "valid_ce": 0.7723931893144091, + "time": 26.540124654769897 + }, + { + "num_hiddens": [ + 100, + 16 + ], + "save_dir": "results_hidden/100_16", + "test_acc": 0.7558441558441559, + "test_ce": 0.7074691190956994, + "train_acc": 0.8624777711914642, + "train_ce": 0.3836734071522731, + "valid_acc": 0.7684964200477327, + "valid_ce": 0.7615377030221268, + "time": 24.37882685661316 + }, + { + "num_hiddens": [ + 100, + 32 + ], + "save_dir": "results_hidden/100_32", + "test_acc": 0.7376623376623377, + "test_ce": 0.7631072717155784, + "train_acc": 0.8396561944279787, + "train_ce": 0.4572284188950598, + "valid_acc": 0.7589498806682577, + "valid_ce": 0.7501452107064668, + "time": 23.91041326522827 + }, + { + "num_hiddens": [ + 100, + 64 + ], + "save_dir": "results_hidden/100_64", + "test_acc": 0.7454545454545455, + "test_ce": 0.6839039631003084, + "train_acc": 0.8604030823947837, + "train_ce": 0.3869656183298651, + "valid_acc": 0.7756563245823389, + 
"valid_ce": 0.7299145110769727, + "time": 32.952547550201416 + }, + { + "num_hiddens": [ + 100, + 100 + ], + "save_dir": "results_hidden/100_100", + "test_acc": 0.7740259740259741, + "test_ce": 0.6491708717602728, + "train_acc": 0.8494368701837581, + "train_ce": 0.42341537669073953, + "valid_acc": 0.766109785202864, + "valid_ce": 0.7455287196446071, + "time": 30.947799921035767 + } + ] +} \ No newline at end of file diff --git a/layer of nn.png b/layer of nn.png new file mode 100644 index 0000000..4dd90ae Binary files /dev/null and b/layer of nn.png differ diff --git a/llr_lbs_m0_v_acc.png b/llr_lbs_m0_v_acc.png new file mode 100644 index 0000000..d71d62a Binary files /dev/null and b/llr_lbs_m0_v_acc.png differ diff --git a/llr_lbs_m5_v_acc.png b/llr_lbs_m5_v_acc.png new file mode 100644 index 0000000..95d6fab Binary files /dev/null and b/llr_lbs_m5_v_acc.png differ diff --git a/llr_lbs_m9_v_acc.png b/llr_lbs_m9_v_acc.png new file mode 100644 index 0000000..07f08bf Binary files /dev/null and b/llr_lbs_m9_v_acc.png differ diff --git a/lr0.5plot.png b/lr0.5plot.png new file mode 100644 index 0000000..719b7be Binary files /dev/null and b/lr0.5plot.png differ diff --git a/lr1plot.png b/lr1plot.png new file mode 100644 index 0000000..1916858 Binary files /dev/null and b/lr1plot.png differ diff --git a/ml_hw3.ipynb b/ml_hw3.ipynb new file mode 100644 index 0000000..d9f5570 --- /dev/null +++ b/ml_hw3.ipynb @@ -0,0 +1,11896 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 기계학습 - 2022년 2학기" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 과제2. 
다중계층 신경망을 이용한 얼굴 표정 분류기 작성\n", + "\n", + "과제 문의: 전북대학교 컴퓨터공학부 시각 및 학습 연구실 (공과대학 7호관 7619)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Requirements\n", + "- Python >= 3.6\n", + "- numpy\n", + "- matplotlib\n", + "- jupyterplot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "이번 과제에서는 사람 얼굴의 표정 데이터셋(Toronto Faces Dataset, [TFD](http://aclab.ca/users/josh/TFD.html))을 분류하는 다중계층 신경망(Multi-Layer Neural Net)을 구현하고 테스트합니다." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "\n", + "import importlib.util\n", + "try:\n", + " importlib.util.find_spec('jupyterplot')\n", + "except ImportError:\n", + " %pip install jupyterplot\n", + " pass\n", + "\n", + "from jupyterplot import ProgressPlot\n", + "\n", + "try:\n", + " importlib.util.find_spec('tqdm')\n", + "except ImportError:\n", + " %pip install tqdm\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Toronto Faces Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "TFD는 1-Anger, 2-Disgust, 3-Fear, 4-Happy, 5-Sad, 6-Surprise, 7-Neutral의 총 7개의 클래스를 가진 데이터셋입니다.\n", + "\n", + "데이터셋은 학습, 검증, 테스트(training, validation, test)를 위해서 각각 3374, 419, 385장의 48 $\\times$ 48 크기 grayscale 이미지를 제공합니다.\n", + "\n", + "데이터셋의 예시를 확인하기 위해 아래 셀들을 실행해보시기 바랍니다." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#### Please DO NOT DELETE this cell. 
###\n", + "\n", + "def LoadData(fname):\n", + " \"\"\"Loads data from an NPZ file.\n", + "\n", + " Args:\n", + " fname: NPZ filename.\n", + "\n", + " Returns:\n", + " data: Tuple {inputs, target}_{train, valid, test}.\n", + " Row-major, outer axis to be the number of observations.\n", + " \"\"\"\n", + " npzfile = np.load(fname)\n", + "\n", + " inputs_train = npzfile['inputs_train'].T / 255.0\n", + " inputs_valid = npzfile['inputs_valid'].T / 255.0\n", + " inputs_test = npzfile['inputs_test'].T / 255.0\n", + " target_train = npzfile['target_train'].tolist()\n", + " target_valid = npzfile['target_valid'].tolist()\n", + " target_test = npzfile['target_test'].tolist()\n", + "\n", + " num_class = max(target_train + target_valid + target_test) + 1\n", + " target_train_1hot = np.zeros([num_class, len(target_train)])\n", + " target_valid_1hot = np.zeros([num_class, len(target_valid)])\n", + " target_test_1hot = np.zeros([num_class, len(target_test)])\n", + "\n", + " for ii, xx in enumerate(target_train):\n", + " target_train_1hot[xx, ii] = 1.0\n", + "\n", + " for ii, xx in enumerate(target_valid):\n", + " target_valid_1hot[xx, ii] = 1.0\n", + "\n", + " for ii, xx in enumerate(target_test):\n", + " target_test_1hot[xx, ii] = 1.0\n", + "\n", + " inputs_train = inputs_train.T\n", + " inputs_valid = inputs_valid.T\n", + " inputs_test = inputs_test.T\n", + " target_train_1hot = target_train_1hot.T\n", + " target_valid_1hot = target_valid_1hot.T\n", + " target_test_1hot = target_test_1hot.T\n", + " return inputs_train, inputs_valid, inputs_test, target_train_1hot, target_valid_1hot, target_test_1hot\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training dataset\n", + "inputs: (3374, 2304) targets: (3374, 7)\n", + "validation dataset\n", + "inputs: (419, 2304) targets: (419, 7)\n", + "test dataset\n", + "inputs: (385, 2304) targets: (385, 7)\n" + ] + }, + { + 
"data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data = LoadData('./toronto_face.npz')\n", + "inputs = data[:3]\n", + "targets = data[3:]\n", + "inputs = {k:v for k, v in zip(['train', 'valid', 'test'], inputs)}\n", + "targets = {k:v for k, v in zip(['train', 'valid', 'test'], targets)}\n", + "\n", + "print('training dataset')\n", + "print('inputs:', inputs['train'].shape, 'targets:', targets['train'].shape)\n", + "print('validation dataset')\n", + "print('inputs:', inputs['valid'].shape, 'targets:', targets['valid'].shape)\n", + "print('test dataset')\n", + "print('inputs:', inputs['test'].shape, 'targets:', targets['test'].shape)\n", + "\n", + "classes = ['anger', 'disgust', 'fear', 'happy', 'sad', 'suprise', 'neutral']\n", + "_, labels = np.nonzero(targets['train'])\n", + "\n", + "figs, axes = plt.subplots(nrows=1, ncols=7, figsize=(14,7))\n", + "for idx in range(7):\n", + " axis = axes[idx]\n", + " rnd_idx = np.random.choice(np.nonzero(labels == idx)[0])\n", + " axis.axis('off')\n", + " axis.imshow(inputs['train'][rnd_idx].reshape(48, 48), cmap='gray')\n", + " axis.set_title('{}: {}'.format(rnd_idx, classes[idx]))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training Multi-layer Neural Networks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "1. 기본적인 일반화 (basic generalization): 코드에 주어진 hyperparameter 들을 이용하여 신경망을 학습시킨다. 학습 오차(training error)와 일반화를 위한 검증 오차(validation error) 결과가 어떻게 다른지 설명한다. 두 가지 경우(학습과 일반화 검증)에 대해 오차 커브(error curve)를 그래프로 제시하시오." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. 최적화 (optimization): Learning rate, momentum, mini-batch size 세 가지 종류의 parameter 들을 아래와 같이 변화시키면서 다양한 조합들에 대해 신경망이 cross-entropy 관점에서 어떻게 수렴하는지 살펴본다. 가장 우수한 성능을 나타내는 hyperparameter 들의 조합이 어떤 것인지 제시하시오. (모든 경우의 수를 다 따지면 75 가지 신경망 모델을 테스트해야 하나 시간이 너무 많이 결릴 수 있으므로 이 중에서 일부분의 경우들만 테스트해도 된다. 
그러나 어떤 근거로 해당 조합들만 테스트했는지 적당한 설명이 있어야 함.)\n", + " - Learning rate ( $\\epsilon$ ): 0.001 에서 1.0 사이의 5 가지 경우\n", + " - Momentum: 0.0 에서 0.9 사이의 3 가지 경우\n", + " - Mini-batch size: 1 에서 1000 까지의 5 가지 경우" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. 신경망 모델 구조 변경: Momentum 을 0.9로 고정시킨 상태에서 신경망의 hidden unit 들의 개수를 2 에서 100 사이의 3 가지 다른 경우에 대해 성능을 비교한다. 필요한 경우 learning rate 와 학습 기간(epochs)은 신경망 구조에 따라 적당하게 변경할 수 있다. Hidden unit 의 개수들이 학습에서의 수렴과 신경망의 일반화 성능에 미치는 영향에 대한 데이터(표나 그래프)를 제시하고 경향을 분석하시오." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Method and Class Definitions for Neural Networks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Utility methods" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def Save(fname: str, data):\n", + " \"\"\"Saves the model to a numpy file.\"\"\"\n", + " print('Writing to ' + fname)\n", + " np.savez_compressed(fname, **data)\n", + "\n", + "\n", + "def Load(fname: str):\n", + " \"\"\"Loads model from numpy file.\"\"\"\n", + " print('Loading from ' + fname)\n", + " return dict(np.load(fname))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Utility Classes" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from dataclasses import dataclass, fields, asdict\n", + "from os import PathLike\n", + "from typing import List, Tuple, Dict, Any, Union, Optional, TextIO\n", + "import json\n", + "\n", + "\n", + "class BaseDataclass:\n", + " def to_dict(self):\n", + " return asdict(self)\n", + "\n", + " def to_json(self, fp: Union[str, PathLike, TextIO]):\n", + " json.dump(self.to_dict(), fp, indent=2)\n", + "\n", + " @classmethod\n", + " def from_dict(cls, d: Dict[str, Any]):\n", + " return cls(**d)\n", + "\n", + " @classmethod\n", + " def from_json_stream(cls, fp: TextIO):\n", + " return 
cls.from_dict(json.load(fp))\n", + " \n", + " @classmethod\n", + " def load_from_json(cls, fp_or_name: Union[str, PathLike, TextIO]):\n", + " if isinstance(fp_or_name, str) or isinstance(fp_or_name, PathLike):\n", + " with open(fp_or_name, 'r') as fp:\n", + " return cls.from_json_stream(fp)\n", + " else:\n", + " return cls.from_json_stream(fp_or_name)\n", + " \n", + " def save_json(self, fp_or_name: Union[str, PathLike, TextIO]):\n", + " if isinstance(fp_or_name, str) or isinstance(fp_or_name, PathLike):\n", + " with open(fp_or_name, 'w') as fp:\n", + " self.to_json(fp)\n", + " else:\n", + " self.to_json(fp_or_name)\n", + " \n", + " def keys(self):\n", + " return [f.name for f in fields(self)]\n", + " \n", + " def values(self):\n", + " return [getattr(self, f.name) for f in fields(self)]\n", + " \n", + " def items(self):\n", + " return [(f.name, getattr(self, f.name)) for f in fields(self)]\n", + " \n", + " def copy(self):\n", + " return self.from_dict(self.to_dict())\n", + "\n", + " def __getitem__(self, key):\n", + " return getattr(self, key)\n", + "\n", + " def __setitem__(self, key, value):\n", + " return setattr(self, key, value)\n", + "\n", + " def __iter__(self):\n", + " return iter(self.keys())\n", + "\n", + "@dataclass\n", + "class Config(BaseDataclass):\n", + " \"\"\"Configuration for the neural network.\"\"\"\n", + " num_inputs: int = 2304\n", + " num_hiddens: Tuple[int,int] = (16, 8)\n", + " num_outputs: int = 7\n", + " eps: float = 1e-3\n", + " momentum: float = 0.9\n", + " num_epochs: int = 100\n", + " batch_size: int = 128\n", + " early_stopping: bool = True\n", + " patience: int = 10\n", + "\n", + "@dataclass\n", + "class ModelWeights(BaseDataclass):\n", + " \"\"\"Model for the neural network.\"\"\"\n", + " W1: np.ndarray\n", + " b1: np.ndarray\n", + " W2: np.ndarray\n", + " b2: np.ndarray\n", + " W3: np.ndarray\n", + " b3: np.ndarray\n", + "\n", + " \n", + " def to_json(cls, fp: Union[str, PathLike, TextIO]):\n", + " raise 
NotImplementedError('Cannot save model weights to JSON.')\n", + " \n", + " def save_json(cls, fp_or_name: Union[str, PathLike, TextIO]):\n", + " raise NotImplementedError('Cannot save model weights to JSON.')\n", + "\n", + " @classmethod\n", + " def from_json_stream(cls, fp: TextIO):\n", + " raise NotImplementedError('Cannot load model weights from JSON.')\n", + "\n", + " @classmethod\n", + " def load_from_json(cls, fp_or_name: Union[str, PathLike, TextIO]):\n", + " raise NotImplementedError('Cannot load model weights from JSON.')\n", + " \n", + " def copy(self):\n", + " return ModelWeights(\n", + " W1=self.W1.copy(),\n", + " b1=self.b1.copy(),\n", + " W2=self.W2.copy(),\n", + " b2=self.b2.copy(),\n", + " W3=self.W3.copy(),\n", + " b3=self.b3.copy(),\n", + " )\n", + "\n", + " def save(self, fp: Union[str, PathLike, TextIO]):\n", + " \"\"\"Saves the model to a numpy file.\"\"\"\n", + " np.savez_compressed(fp, **asdict(self))\n", + "\n", + " @classmethod\n", + " def load(cls, fp: Union[str, PathLike, TextIO]):\n", + " \"\"\"Loads model from numpy file.\"\"\"\n", + " # Since the numpy version after 1.16.2, In response to CVE-2019-6446(https://nvd.nist.gov/vuln/detail/CVE-2019-6446),\n", + " # np.savez_compressed allow_pickle=False by default.\n", + " # In 1.16.2 and earlier, Arbitrary code execution can be performed by loading a maliciously crafted .npy file.\n", + " # So, I set allow_pickle=False to prevent this vulnerability.\n", + " data = dict(np.load(fp, allow_pickle=False))\n", + " \n", + " return cls(**data)\n", + "\n", + "@dataclass\n", + "class Statistic(BaseDataclass):\n", + " \"\"\"Statistics for the neural network.\"\"\"\n", + " train_ce: List[Tuple[int, float]]\n", + " valid_ce: List[Tuple[int, float]]\n", + " train_acc: List[Tuple[int, float]]\n", + " valid_acc: List[Tuple[int, float]]\n", + " test_ce: float\n", + " test_acc: float\n", + "\n", + " def keys(self):\n", + " return [f.name for f in fields(self)]\n", + " \n", + " def __getitem__(self, 
key):\n", + " return getattr(self, key)\n", + "\n", + " def best_valid_acc(self):\n", + " return max(self.valid_acc, key=lambda x: x[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "if False:\n", + " import io\n", + " # Test the dataclass\n", + " # Config\n", + " config = Config(2304, (100, 50), 7, 0.01, 0.9, 100, 100)\n", + " fp = io.StringIO()\n", + " config.save_json(fp)\n", + " fp.seek(0)\n", + " config = Config.load_from_json(fp)\n", + " print(config)\n", + "\n", + " # ModelWeights\n", + " model = ModelWeights(np.random.randn(2304, 100), np.random.randn(100), np.random.randn(100, 50), np.random.randn(50), np.random.randn(50, 7), np.random.randn(7))\n", + " fp = io.BytesIO()\n", + " model.save(fp)\n", + " fp.seek(0)\n", + " model = ModelWeights.load(fp)\n", + " print(model.keys())\n", + " \n", + " # Statistic\n", + " stat = Statistic([(1, 0.1), (2, 0.2)], [(1, 0.3), (2, 0.4)], [(1, 0.5), (2, 0.6)], [(1, 0.7), (2, 0.8)], 0.9, 1.0)\n", + " fp = io.StringIO()\n", + " stat.save_json(fp)\n", + " fp.seek(0)\n", + " stat = Statistic.load_from_json(fp)\n", + " print(stat.keys())\n", + " print(stat.best_valid_acc())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Neural Networks\n", + "아래는 neural networks 의 초기화 및 forward pass를 구현한 코드 입니다." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def Affine(x: np.ndarray, w: np.ndarray, b: np.ndarray) -> np.ndarray:\n", + " \"\"\"Computes the affine transformation.\n", + "\n", + " Args:\n", + " x: Inputs\n", + " w: Weights\n", + " b: Bias\n", + "\n", + " Returns:\n", + " y: Outputs\n", + " \"\"\"\n", + " # y = np.dot(w.T, x) + b\n", + " y = x.dot(w) + b\n", + " return y\n", + "\n", + "def ReLU(x: np.ndarray) -> np.ndarray:\n", + " \"\"\"Computes the ReLU activation function.\n", + "\n", + " Args:\n", + " x: Inputs\n", + "\n", + " Returns:\n", + " y: Activation\n", + " \"\"\"\n", + " return np.maximum(x, 0.0)\n", + "\n", + "def Softmax(x: np.ndarray) -> np.ndarray:\n", + " \"\"\"Computes the softmax activation function.\n", + "\n", + " Args:\n", + " x: Inputs\n", + "\n", + " Returns:\n", + " y: Activation\n", + " \"\"\"\n", + " x -= np.max(x, axis=1, keepdims=True)\n", + " return np.exp(x) / np.exp(x).sum(axis=1, keepdims=True)\n", + "\n", + "def InitMLP(num_inputs: int, num_hiddens: Tuple[int, int], num_outputs: int):\n", + " \"\"\"Initializes NN parameters.\n", + "\n", + " Args:\n", + " num_inputs: Number of input units.\n", + " num_hiddens: List of two elements, hidden size for each layer.\n", + " num_outputs: Number of output units.\n", + "\n", + " Returns:\n", + " model: Randomly initialized network weights.\n", + " \"\"\"\n", + " W1 = 0.1 * np.random.randn(num_inputs, num_hiddens[0])\n", + " W2 = 0.1 * np.random.randn(num_hiddens[0], num_hiddens[1])\n", + " W3 = 0.01 * np.random.randn(num_hiddens[1], num_outputs)\n", + " b1 = np.zeros((num_hiddens[0]))\n", + " b2 = np.zeros((num_hiddens[1]))\n", + " b3 = np.zeros((num_outputs))\n", + " model = ModelWeights(W1, b1, W2, b2, W3, b3)\n", + " return model\n", + "\n", + "def NNForward(model: ModelWeights, x: np.ndarray) -> Dict[str, np.ndarray]:\n", + " \"\"\"Runs the forward pass.\n", + "\n", + " Args:\n", + " model: Dictionary of all the 
weights.\n", + " x: Input to the network.\n", + "\n", + " Returns:\n", + " var: Dictionary of all intermediate variables.\n", + " \"\"\"\n", + " h1 = Affine(x, model.W1, model.b1)\n", + " h1r = ReLU(h1)\n", + " h2 = Affine(h1r, model.W2, model.b2)\n", + " h2r = ReLU(h2)\n", + " y = Affine(h2r, model.W3, model.b3)\n", + " var = {\n", + " 'x': x,\n", + " 'h1': h1,\n", + " 'h1r': h1r,\n", + " 'h2': h2,\n", + " 'h2r': h2r,\n", + " 'y': y\n", + " }\n", + " return var" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "아래는 neural networks 의 backward pass 를 구현하기 위한 코드들입니다.\n", + "아래 세 부분을 채워 코드를 완성시키기 바랍니다.\n", + "\n", + "1. Affine layer 의 backward pass equations (linear transformation + bias).\n", + "2. ReLU activation function 의 backward pass equations.\n", + "3. Momentum 이 포함된 weight update equations." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def AffineBackward(grad_y: np.ndarray, x: np.ndarray, w: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:\n", + " \"\"\"Computes gradients of affine transformation.\n", + "\n", + " Args:\n", + " grad_y: gradient from last layer\n", + " x: inputs\n", + " w: weights\n", + "\n", + " Returns:\n", + " grad_x: Gradients wrt. the inputs.\n", + " grad_w: Gradients wrt. the weights.\n", + " grad_b: Gradients wrt. the biases.\n", + " \"\"\"\n", + " grad_x = grad_y.dot(w.T)\n", + " grad_w = x.T.dot(grad_y)\n", + " grad_b = np.sum(grad_y, axis=0)\n", + " return grad_x, grad_w, grad_b" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def ReLUBackward(grad_y: np.ndarray, x: np.ndarray, y: np.ndarray) -> np.ndarray:\n", + " \"\"\"Computes gradients of the ReLU activation function.\n", + "\n", + " Returns:\n", + " grad_x: Gradients wrt. 
the inputs.\n", + " \"\"\"\n", + " grad_x = grad_y * (x > 0)\n", + " return grad_x" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def NNBackward(model: ModelWeights, err: np.ndarray, var: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:\n", + " \"\"\"Runs the backward pass.\n", + "\n", + " Args:\n", + " model: Dictionary of all the weights.\n", + " err: Gradients to the output of the network.\n", + " var: Intermediate variables from the forward pass.\n", + " Returns:\n", + " grads: Gradients to all the weights.\n", + " \"\"\"\n", + " dE_dh2r, dE_dW3, dE_db3 = AffineBackward(err, var['h2r'], model['W3'])\n", + " dE_dh2 = ReLUBackward(dE_dh2r, var['h2'], var['h2r'])\n", + " dE_dh1r, dE_dW2, dE_db2 = AffineBackward(dE_dh2, var['h1r'], model['W2'])\n", + " dE_dh1 = ReLUBackward(dE_dh1r, var['h1'], var['h1r'])\n", + " _, dE_dW1, dE_db1 = AffineBackward(dE_dh1, var['x'], model['W1'])\n", + "\n", + " grads = {}\n", + " grads['W1'] = dE_dW1\n", + " grads['W2'] = dE_dW2\n", + " grads['W3'] = dE_dW3\n", + " grads['b1'] = dE_db1\n", + " grads['b2'] = dE_db2\n", + " grads['b3'] = dE_db3\n", + " return grads" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def InitMomentumState(model: ModelWeights) -> Dict[str, np.ndarray]:\n", + " \"\"\"Initializes momentums for all the weights.\n", + "\n", + " Args:\n", + " model: Dictionary of all the weights.\n", + "\n", + " Returns:\n", + " momentums: Dictionary of all the momentums.\n", + " \"\"\"\n", + " momentums = {}\n", + " for key in model.keys():\n", + " momentums[key] = np.zeros_like(model[key])\n", + " return momentums\n", + "\n", + "def NNUpdate(model: ModelWeights, eps: float, momentum: float, optimizer_state: Dict[str, np.ndarray], grads: Dict[str, np.ndarray]):\n", + " \"\"\"Update NN weights.\n", + "\n", + " Args:\n", + " model: Dictionary of all the weights.\n", + " eps: Learning rate.\n", + " 
momentum: Momentum.\n", + " optimizer_state: State of the optimizer.\n", + " tape: Gradients to all the weights.\n", + " \"\"\"\n", + " for key in model:\n", + " # Momentum update\n", + " # optimizer state is the velocity\n", + " optimizer_state[key] = momentum * optimizer_state[key] - eps * grads[key]\n", + " model[key] += optimizer_state[key]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 훈련" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def Train(model, forward, backward, update, eps, momentum, num_epochs,\n", + " batch_size):\n", + " \"\"\"Trains a simple MLP.\n", + "\n", + " Args:\n", + " model: Dictionary of model weights.\n", + " forward: Forward prop function.\n", + " backward: Backward prop function.\n", + " update: Update weights function.\n", + " eps: Learning rate.\n", + " momentum: Momentum.\n", + " num_epochs: Number of epochs to run training for.\n", + " batch_size: Mini-batch size, -1 for full batch.\n", + "\n", + " Returns:\n", + " stats: Dictionary of training statistics.\n", + " - train_ce: Training cross entropy.\n", + " - valid_ce: Validation cross entropy.\n", + " - train_acc: Training accuracy.\n", + " - valid_acc: Validation accuracy.\n", + " \"\"\"\n", + " inputs_train, inputs_valid, inputs_test, target_train, target_valid, \\\n", + " target_test = LoadData('./toronto_face.npz')\n", + " rnd_idx = np.arange(inputs_train.shape[0])\n", + " train_ce_list = []\n", + " valid_ce_list = []\n", + " train_acc_list = []\n", + " valid_acc_list = []\n", + " \n", + " num_train_cases = inputs_train.shape[0]\n", + " if batch_size == -1:\n", + " batch_size = num_train_cases\n", + " num_steps = int(np.ceil(num_train_cases / batch_size))\n", + "\n", + " pp = ProgressPlot(\n", + " plot_names=['Cross entropy', 'Accuracy'],\n", + " line_names=['Train', 'Validation'],\n", + " x_label='Iteration',\n", + " x_lim=[0, num_epochs*num_steps]\n", + " )\n", + " 
optimizer_state = InitMomentumState(model)\n", + "\n", + " valid_ce = 0\n", + " valid_acc = 0\n", + " for epoch in range(num_epochs):\n", + " np.random.shuffle(rnd_idx)\n", + " inputs_train = inputs_train[rnd_idx]\n", + " target_train = target_train[rnd_idx]\n", + " for step in range(num_steps):\n", + " # Forward prop.\n", + " start = step * batch_size\n", + " end = min(num_train_cases, (step + 1) * batch_size)\n", + " x = inputs_train[start: end]\n", + " t = target_train[start: end]\n", + "\n", + " var = forward(model, x)\n", + " prediction = Softmax(var['y'])\n", + "\n", + " train_ce = -np.sum(t * np.log(prediction)) / x.shape[0]\n", + " train_acc = (np.argmax(prediction, axis=1) ==\n", + " np.argmax(t, axis=1)).astype('float').mean()\n", + " pp.update([[train_ce, valid_ce], [train_acc, valid_acc]])\n", + "\n", + " # Compute error.\n", + " error = (prediction - t) / x.shape[0]\n", + "\n", + " # Backward prop.\n", + " grads = backward(model, error, var)\n", + "\n", + " # Update weights.\n", + " update(model, eps, momentum, optimizer_state, grads)\n", + "\n", + " valid_ce, valid_acc = Evaluate(\n", + " inputs_valid, target_valid, model, forward, batch_size=batch_size)\n", + " \n", + " pp.update([[train_ce, valid_ce], [train_acc, valid_acc]])\n", + " train_ce_list.append((epoch, train_ce))\n", + " train_acc_list.append((epoch, train_acc))\n", + " valid_ce_list.append((epoch, valid_ce))\n", + " valid_acc_list.append((epoch, valid_acc))\n", + "\n", + " # print()\n", + " train_ce, train_acc = Evaluate(\n", + " inputs_train, target_train, model, forward, batch_size=batch_size)\n", + " valid_ce, valid_acc = Evaluate(\n", + " inputs_valid, target_valid, model, forward, batch_size=batch_size)\n", + " test_ce, test_acc = Evaluate(\n", + " inputs_test, target_test, model, forward, batch_size=batch_size)\n", + " print('CE: Train %.5f Validation %.5f Test %.5f' %\n", + " (train_ce, valid_ce, test_ce))\n", + " print('Acc: Train {:.5f} Validation {:.5f} Test {:.5f}'.format(\n", 
+ " train_acc, valid_acc, test_acc))\n", + " pp.finalize()\n", + " stats = {\n", + " 'train_ce': train_ce_list,\n", + " 'valid_ce': valid_ce_list,\n", + " 'train_acc': train_acc_list,\n", + " 'valid_acc': valid_acc_list\n", + " }\n", + "\n", + " return model, stats\n", + "\n", + "def Evaluate(inputs, target, model, forward, batch_size=-1):\n", + " \"\"\"Evaluates the model on inputs and target.\n", + "\n", + " Args:\n", + " inputs: Inputs to the network.\n", + " target: Target of the inputs.\n", + " model: Dictionary of network weights.\n", + " \"\"\"\n", + " num_cases = inputs.shape[0]\n", + " if batch_size == -1:\n", + " batch_size = num_cases\n", + " num_steps = int(np.ceil(num_cases / batch_size))\n", + " ce = 0.0\n", + " acc = 0.0\n", + " for step in range(num_steps):\n", + " start = step * batch_size\n", + " end = min(num_cases, (step + 1) * batch_size)\n", + " x = inputs[start: end]\n", + " t = target[start: end]\n", + " prediction = Softmax(forward(model, x)['y'])\n", + " ce += -np.sum(t * np.log(prediction))\n", + " acc += (np.argmax(prediction, axis=1) == np.argmax(\n", + " t, axis=1)).astype('float').sum()\n", + " ce /= num_cases\n", + " acc /= num_cases\n", + " return ce, acc" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def CheckGrad(model, forward, backward, name, x):\n", + " \"\"\"Check the gradients\n", + "\n", + " Args:\n", + " model: Dictionary of network weights.\n", + " name: Weights name to check.\n", + " x: Fake input.\n", + " \"\"\"\n", + " np.random.seed(0)\n", + " var = forward(model, x)\n", + " loss = lambda y: 0.5 * (y ** 2).sum()\n", + " grad_y = var['y']\n", + " grads = backward(model, grad_y, var)\n", + " grad_w = grads[name].ravel()\n", + " w_ = model[name].ravel()\n", + " eps = 1e-7\n", + " grad_w_2 = np.zeros(w_.shape)\n", + " check_elem = np.arange(w_.size)\n", + " np.random.shuffle(check_elem)\n", + " # Randomly check 20 elements.\n", + " check_elem = 
check_elem[:20]\n", + " for ii in check_elem:\n", + " w_[ii] += eps\n", + " err_plus = loss(forward(model, x)['y'])\n", + " w_[ii] -= 2 * eps\n", + " err_minus = loss(forward(model, x)['y'])\n", + " w_[ii] += eps\n", + " grad_w_2[ii] = (err_plus - err_minus) / 2 / eps\n", + " np.testing.assert_almost_equal(grad_w[check_elem], grad_w_2[check_elem],\n", + " decimal=3)\n", + "\n", + "\n", + "def main():\n", + " \"\"\"Trains a NN.\"\"\"\n", + " model_fname = 'nn_model.npz'\n", + " stats_fname = 'nn_stats.npz'\n", + "\n", + " # Hyper-parameters. Modify them if needed.\n", + " num_hiddens = [16, 32]\n", + " eps = 0.01\n", + " momentum = 0.0\n", + " num_epochs = 1000\n", + " batch_size = 100\n", + "\n", + " # Input-output dimensions.\n", + " num_inputs = 2304\n", + " num_outputs = 7\n", + "\n", + " # Initialize model.\n", + " model = InitMLP(num_inputs, num_hiddens, num_outputs)\n", + "\n", + " # Uncomment to reload trained model here.\n", + " # model = Load(model_fname)\n", + "\n", + " # Check gradient implementation.\n", + " print('Checking gradients...')\n", + " x = np.random.rand(10, 48 * 48) * 0.1\n", + " CheckGrad(model, NNForward, NNBackward, 'W3', x)\n", + " CheckGrad(model, NNForward, NNBackward, 'b3', x)\n", + " CheckGrad(model, NNForward, NNBackward, 'W2', x)\n", + " CheckGrad(model, NNForward, NNBackward, 'b2', x)\n", + " CheckGrad(model, NNForward, NNBackward, 'W1', x)\n", + " CheckGrad(model, NNForward, NNBackward, 'b1', x)\n", + " print('Done.')\n", + " # Train model.\n", + " print('training...')\n", + " trained_model, stats = Train(model, NNForward, NNBackward, NNUpdate, eps,\n", + " momentum, num_epochs, batch_size)\n", + "\n", + " plt.figure(0)\n", + " plt.plot(np.array(stats['train_ce'])[:, 0], np.array(stats['train_ce'])[:, 1], 'b', label='Train')\n", + " plt.plot(np.array(stats['valid_ce'])[:, 0], np.array(stats['valid_ce'])[:, 1], 'orange', label='Validation')\n", + " plt.xlabel('Epoch')\n", + " plt.ylabel('Cross Entropy')\n", + " plt.legend()\n", + 
"\n", + " plt.figure(1)\n", + " plt.plot(np.array(stats['train_acc'])[:, 0], np.array(stats['train_acc'])[:, 1], 'b', label='Train')\n", + " plt.plot(np.array(stats['valid_acc'])[:, 0], np.array(stats['valid_acc'])[:, 1], 'orange', label='Validation')\n", + " plt.xlabel('Epoch')\n", + " plt.ylabel('Accuracy')\n", + " plt.legend()\n", + " plt.show()\n", + " # Uncomment if you wish to save the model.\n", + " Save(model_fname, model)\n", + "\n", + " # Uncomment if you wish to save the training statistics.\n", + " Save(stats_fname, stats)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Early Stopping 이 적용된 훈련" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "사양이 좋지 않은 컴퓨터에서 `ProgressPlot` 이 항목 수가 많아지면서(약 10000부터) 더 이상 그래프가 제대로 그리지 못하고 느려지는 있습니다. 이 문제는 `ProgressPlot`이 그래프를 그리는 것이 O(N)의 복잡도를 가져서 그렇습니다. 이를 해결하기 위해 `ProgressPlot`에서 `Tqdm` 으로 변경하였습니다. " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "\n", + "def TrainAdvanced(model: ModelWeights, \n", + " forward=NNForward,\n", + " backward=NNBackward,\n", + " update=NNUpdate,\n", + " eps = 0.01,\n", + " momentum = 0.0,\n", + " num_epochs = 1000,\n", + " batch_size = 100,\n", + " early_stopping: bool = True,\n", + " patience: int = 10,\n", + " verbose: bool = True,\n", + " tqdm_leave: bool = True,\n", + " pplot: bool = False,\n", + " ) -> Tuple[ModelWeights, Statistic]:\n", + " \"\"\"Trains a simple MLP.\n", + "\n", + " Args:\n", + " model: Dictionary of model weights.\n", + " forward: Forward prop function.\n", + " backward: Backward prop function.\n", + " update: Update weights function.\n", + " eps: Learning rate.\n", + " momentum: Momentum.\n", + " num_epochs: Number of epochs to run training for.\n", + " batch_size: Mini-batch size, -1 for full batch.\n", + " early_stopping: Whether to use early stopping.\n", + " patience: Number of epochs to wait before early 
stopping.\n", + " verbose: Whether to print training statistics.\n", + " tqdm_leave: Whether to leave tqdm progress bar.\n", + " pplot: Whether to plot training statistics.\n", + "\n", + " Returns:\n", + " model: Trained model.\n", + " stats: Dictionary of training statistics.\n", + " - train_ce: Training cross entropy.\n", + " - valid_ce: Validation cross entropy.\n", + " - train_acc: Training accuracy.\n", + " - valid_acc: Validation accuracy.\n", + " \"\"\"\n", + " # load data\n", + " inputs_train, inputs_valid, inputs_test, target_train, target_valid, \\\n", + " target_test = LoadData('./toronto_face.npz')\n", + " \n", + " rnd_idx = np.arange(inputs_train.shape[0])\n", + " train_ce_list = []\n", + " valid_ce_list = []\n", + " train_acc_list = []\n", + " valid_acc_list = []\n", + " \n", + " num_train_cases = inputs_train.shape[0]\n", + " if batch_size == -1 or batch_size > num_train_cases or batch_size == 0:\n", + " batch_size = num_train_cases\n", + " num_steps = int(np.ceil(num_train_cases / batch_size))\n", + "\n", + " try:\n", + " if pplot:\n", + " # initialize plot\n", + " pp = ProgressPlot(\n", + " plot_names=['Cross entropy', 'Accuracy'],\n", + " line_names=['Train', 'Validation'],\n", + " x_label='Iteration',\n", + " x_lim=[0, num_epochs]\n", + " )\n", + " pbar = range(num_epochs)\n", + " else:\n", + " # Tqdm progress bar.\n", + " pbar = tqdm(range(num_epochs), disable=not verbose or num_epochs == 1, leave=tqdm_leave)\n", + "\n", + " # Initialize optimizer state\n", + " optimizer_state = InitMomentumState(model)\n", + "\n", + " # Initialize stats.\n", + " valid_ce = 0\n", + " valid_acc = 0\n", + "\n", + " # Early stopping\n", + " best_valid_ce = np.inf\n", + " best_valid_acc = 0\n", + " best_epoch = 0\n", + " best_model = None\n", + "\n", + " epsilon = np.finfo(float).eps\n", + "\n", + " for epoch in pbar:\n", + " np.random.shuffle(rnd_idx)\n", + " inputs_train = inputs_train[rnd_idx]\n", + " target_train = target_train[rnd_idx]\n", + "\n", + " train_ce 
= 0\n", + " train_acc = 0\n", + " for step in range(num_steps):\n", + " # Get mini-batch.\n", + " start = step * batch_size\n", + " # min is used to handle the case when batch_size does not divide num_train_cases\n", + " end = min(num_train_cases, (step + 1) * batch_size)\n", + "\n", + " input_batch = inputs_train[start: end]\n", + " target_batch = target_train[start: end]\n", + "\n", + " # Forward prop.\n", + " var = forward(model, input_batch)\n", + " prediction = Softmax(var['y'])\n", + "\n", + " # Compute loss.\n", + " train_ce += -np.sum(target_batch * np.log(prediction + epsilon)) / input_batch.shape[0]\n", + " train_acc += (np.argmax(prediction, axis=1) ==\n", + " np.argmax(target_batch, axis=1)).astype('float').sum()\n", + "\n", + " # Compute error.\n", + " error = (prediction - target_batch) / input_batch.shape[0]\n", + "\n", + " # Backward prop.\n", + " grads = backward(model, error, var)\n", + "\n", + " # Update weights.\n", + " update(model, eps, momentum, optimizer_state, grads)\n", + "\n", + " train_ce /= num_steps\n", + " train_acc /= num_train_cases\n", + "\n", + " # Compute validation error.\n", + " valid_ce, valid_acc = Evaluate(\n", + " inputs_valid, target_valid, model, forward, batch_size=batch_size)\n", + "\n", + " train_ce_list.append((epoch, train_ce))\n", + " train_acc_list.append((epoch, train_acc))\n", + " valid_ce_list.append((epoch, valid_ce))\n", + " valid_acc_list.append((epoch, valid_acc))\n", + "\n", + " if pplot:\n", + " # Update plot.\n", + " pp.update([[train_ce, valid_ce], [train_acc, valid_acc]])\n", + " else:\n", + " # Tqdm progress bar.\n", + " pbar.set_description(f\"Train CE: {train_ce:.4f}, Valid CE: {valid_ce:.4f}, Train Acc: {train_acc:.4f}, Valid Acc: {valid_acc:.4f}\")\n", + "\n", + " # Early stopping.\n", + " if valid_ce < best_valid_ce:\n", + " best_valid_ce = valid_ce\n", + " best_valid_acc = valid_acc\n", + " best_epoch = epoch\n", + " best_model = model.copy()\n", + " elif early_stopping and epoch - best_epoch >= 
patience:\n", + " model = best_model\n", + " break\n", + " \n", + " test_ce, test_acc = Evaluate(\n", + " inputs_test, target_test, model, forward, batch_size=batch_size)\n", + "\n", + " stats = Statistic(train_ce_list, valid_ce_list, train_acc_list, valid_acc_list,\n", + " test_ce=test_ce, test_acc=test_acc)\n", + " finally:\n", + " if not pplot:\n", + " pbar.close()\n", + " else:\n", + " pp.finalize()\n", + " \n", + " return model, stats" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def TrainMLP(conf: Config, pplot: bool = False) -> Tuple[ModelWeights, Statistic]:\n", + " \"\"\"Trains a simple MLP.\n", + "\n", + " Args:\n", + " conf: Configuration.\n", + " pplot: Whether to plot training statistics.\n", + "\n", + " Returns:\n", + " model: Trained model.\n", + " stats: Dictionary of training statistics.\n", + " - train_ce: Training cross entropy list.\n", + " - valid_ce: Validation cross entropy list.\n", + " - train_acc: Training accuracy list.\n", + " - valid_acc: Validation accuracy list.\n", + " - test_ce: Test cross entropy.\n", + " - test_acc: Test accuracy.\n", + " \"\"\"\n", + " # Initialize model.\n", + " model = InitMLP(\n", + " conf.num_inputs, conf.num_hiddens, conf.num_outputs)\n", + "\n", + " # Train model.\n", + " model, stats = TrainAdvanced(\n", + " model,\n", + " eps=conf.eps,\n", + " momentum=conf.momentum,\n", + " num_epochs=conf.num_epochs,\n", + " batch_size=conf.batch_size,\n", + " early_stopping=conf.early_stopping,\n", + " patience=conf.patience,\n", + " verbose=True,\n", + " tqdm_leave=False,\n", + " pplot=pplot)\n", + "\n", + " return model, stats" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def PlotStats(stats: Statistic, title: str = '', save_path: Optional[str] = None, show: bool = True):\n", + " \"\"\"Plots training statistics.\n", + "\n", + " Args:\n", + " stats: Dictionary of training 
statistics.\n", + " - train_ce: Training cross entropy list.\n", + " - valid_ce: Validation cross entropy list.\n", + " - train_acc: Training accuracy list.\n", + " - valid_acc: Validation accuracy list.\n", + " - test_ce: Test cross entropy.\n", + " - test_acc: Test accuracy.\n", + " title: Plot title.\n", + " \"\"\"\n", + " fig, ax = plt.subplots(1, 2, figsize=(12, 4))\n", + " fig.suptitle(title)\n", + " ax[0].set_title('Cross Entropy')\n", + " ax[0].set_xlabel('Epoch')\n", + " ax[0].set_ylabel('Cross Entropy')\n", + " ax[0].plot(*zip(*stats.train_ce), label='Train')\n", + " ax[0].plot(*zip(*stats.valid_ce), label='Valid')\n", + " ax[0].legend()\n", + " ax[1].set_title('Accuracy')\n", + " ax[1].set_xlabel('Epoch')\n", + " ax[1].set_ylabel('Accuracy')\n", + " ax[1].plot(*zip(*stats.train_acc), label='Train')\n", + " ax[1].plot(*zip(*stats.valid_acc), label='Valid')\n", + " ax[1].legend()\n", + " if save_path is not None:\n", + " plt.savefig(save_path)\n", + " if show:\n", + " plt.show()\n", + " # close the plot\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Test PlotStats.\n", + "if False:\n", + " mock_stats = Statistic(\n", + " train_ce=[(0, 0.5), (1, 0.4), (2, 0.3)],\n", + " valid_ce=[(0, 0.6), (1, 0.5), (2, 0.4)],\n", + " train_acc=[(0, 0.7), (1, 0.8), (2, 0.9)],\n", + " valid_acc=[(0, 0.6), (1, 0.5), (2, 0.4)],\n", + " test_ce=0.3,\n", + " test_acc=0.9,\n", + " )\n", + " PlotStats(mock_stats, title='MLP')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "def ExperimentMLP(conf: Config, title: Optional[str] = None, save_dir: Union[str, PathLike] = 'results', show: bool = True, pplot: bool = False):\n", + " \"\"\"Runs a simple MLP experiment.\n", + "\n", + " Args:\n", + " conf: Configuration.\n", + " save_dir: Directory to save results.\n", + " show: Whether to show plots.\n", + " 
\"\"\"\n", + " if title is None:\n", + " title = f'MLP [{\",\".join([str(s) for s in conf.num_hiddens])}] lr:{conf.eps} m:{conf.momentum} b:{conf.batch_size}'\n", + " # Create save directory.\n", + " os.makedirs(save_dir, exist_ok=True)\n", + "\n", + " # Train model.\n", + " model, stats = TrainMLP(conf, pplot=pplot)\n", + " conf.save_json(os.path.join(save_dir, 'conf.json'))\n", + " model.save(os.path.join(save_dir, 'model.npz'))\n", + " # Plot training statistics.\n", + " PlotStats(stats, title='MLP', save_path=os.path.join(save_dir, 'stats.png'), show=show)\n", + " stats.save_json(os.path.join(save_dir, 'stats.json'))\n", + "\n", + " return model, stats" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "if False:\n", + " import traceback\n", + " import time\n", + " try:\n", + " begin = time.time()\n", + " # Test ExperimentMLP.\n", + " ExperimentMLP(Config(), save_dir='test_mlp', show=True)\n", + " end = time.time()\n", + " print(f\"Time: {end - begin:.2f} seconds\")\n", + " except TypeError as e:\n", + " print('TypeError: ', e)\n", + " traceback.print_exc()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 테스트" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def load_experiment(path: Union[str, PathLike], load_model = False) -> Tuple[Config ,Statistic, Optional[ModelWeights]]:\n", + " \"\"\"Loads experiment result\n", + "\n", + " Args:\n", + " path: Path to experiment directory.\n", + " load_model: Whether to load model.\n", + "\n", + " Returns:\n", + " conf: Configuration.\n", + " stats: Dictionary of training statistics.\n", + " - train_ce: Training cross entropy list.\n", + " - valid_ce: Validation cross entropy list.\n", + " - train_acc: Training accuracy list.\n", + " - valid_acc: Validation accuracy list.\n", + " - test_ce: Test cross entropy.\n", + " - test_acc: Test accuracy.\n", + " model: Trained 
model.\n", + " \"\"\"\n", + " stat = Statistic.load_from_json(os.path.join(path, 'stats.json'))\n", + " conf = Config.load_from_json(os.path.join(path, 'conf.json'))\n", + " model = None\n", + " if load_model:\n", + " model = ModelWeights.load(os.path.join(path, 'model.npz'))\n", + " return conf, stat, model" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "if False:\n", + " if not os.path.exists('test_mlp'):\n", + " ExperimentMLP(Config(), save_dir='test_mlp', show=True)\n", + " conf, stats, model = load_experiment('test_mlp', load_model=True)\n", + " print(conf)\n", + " print(stats)\n", + " print(model)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "def load_experiment_metafile(path: PathLike, init_task_if_not_exists: Optional[List[Any]] = None) -> Dict[str, List[Any]]:\n", + " \"\"\"Load meta data of all experiments\n", + " \n", + " Args:\n", + " path: Path to meta file.\n", + " init_task_if_not_exists: Initialize meta file if not exists.\n", + "\n", + " Returns:\n", + " meta: Dictionary of meta data.\n", + " \"\"\"\n", + " # load previous experiments if any exist\n", + " try:\n", + " with open(path, 'r') as f:\n", + " experiments = json.load(f)\n", + " except FileNotFoundError:\n", + " experiments = {\n", + " \"remain_experiments\": [],\n", + " \"completed_experiment_results\": [] # list of completed experiment\n", + " }\n", + " if init_task_if_not_exists is not None:\n", + " experiments[\"remain_experiments\"] = init_task_if_not_exists.copy() # list of remaining experiment\n", + " return experiments\n", + "\n", + "def save_experiment_metafile(path: PathLike, experiments: Dict[str, Any]):\n", + " \"\"\"Save meta data of all experiments\n", + " \n", + " Args:\n", + " path: Path to meta file.\n", + " experiments: Dictionary of meta data. 
\n", + " \"\"\"\n", + " with open(path, 'w') as f:\n", + " json.dump(experiments, f, indent=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 문제" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 기본적인 일반화 (basic generalization): 코드에 주어진 hyperparameter 들을 이용하여 신경망을 학습시킨다. 학습 오차(training error)와 일반화를 위한 검증 오차(validation error) 결과가 어떻게 다른지 설명한다. 두 가지 경우(학습과 일반화 검증)에 대해 오차 커브(error curve)를 그래프로 제시하시오." + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking gradients...\n", + "Done.\n", + "training...\n" + ] + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": "window.appendLearningCurve([{\"x\": 124.0, \"y\": {\"Cross entropy\": {\"Train\": 1.8584183756742794, \"Validation\": 1.8472406692662053}, \"Accuracy\": {\"Train\": 0.23, \"Validation\": 0.27923627684964203}}}]);", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn [121], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m main()\n", + "Cell \u001b[1;32mIn [116], line 67\u001b[0m, in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m 65\u001b[0m \u001b[39m# Train model.\u001b[39;00m\n\u001b[0;32m 66\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mtraining...\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m---> 67\u001b[0m trained_model, stats \u001b[39m=\u001b[39m Train(model, NNForward, NNBackward, NNUpdate, eps,\n\u001b[0;32m 68\u001b[0m momentum, num_epochs, 
batch_size)\n\u001b[0;32m 70\u001b[0m plt\u001b[39m.\u001b[39mfigure(\u001b[39m0\u001b[39m)\n\u001b[0;32m 71\u001b[0m plt\u001b[39m.\u001b[39mplot(np\u001b[39m.\u001b[39marray(stats[\u001b[39m'\u001b[39m\u001b[39mtrain_ce\u001b[39m\u001b[39m'\u001b[39m])[:, \u001b[39m0\u001b[39m], np\u001b[39m.\u001b[39marray(stats[\u001b[39m'\u001b[39m\u001b[39mtrain_ce\u001b[39m\u001b[39m'\u001b[39m])[:, \u001b[39m1\u001b[39m], \u001b[39m'\u001b[39m\u001b[39mb\u001b[39m\u001b[39m'\u001b[39m, label\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mTrain\u001b[39m\u001b[39m'\u001b[39m)\n", + "Cell \u001b[1;32mIn [53], line 68\u001b[0m, in \u001b[0;36mTrain\u001b[1;34m(model, forward, backward, update, eps, momentum, num_epochs, batch_size)\u001b[0m\n\u001b[0;32m 65\u001b[0m error \u001b[39m=\u001b[39m (prediction \u001b[39m-\u001b[39m t) \u001b[39m/\u001b[39m x\u001b[39m.\u001b[39mshape[\u001b[39m0\u001b[39m]\n\u001b[0;32m 67\u001b[0m \u001b[39m# Backward prop.\u001b[39;00m\n\u001b[1;32m---> 68\u001b[0m grads \u001b[39m=\u001b[39m backward(model, error, var)\n\u001b[0;32m 70\u001b[0m \u001b[39m# Update weights.\u001b[39;00m\n\u001b[0;32m 71\u001b[0m update(model, eps, momentum, optimizer_state, grads)\n", + "Cell \u001b[1;32mIn [51], line 15\u001b[0m, in \u001b[0;36mNNBackward\u001b[1;34m(model, err, var)\u001b[0m\n\u001b[0;32m 13\u001b[0m dE_dh1r, dE_dW2, dE_db2 \u001b[39m=\u001b[39m AffineBackward(dE_dh2, var[\u001b[39m'\u001b[39m\u001b[39mh1r\u001b[39m\u001b[39m'\u001b[39m], model[\u001b[39m'\u001b[39m\u001b[39mW2\u001b[39m\u001b[39m'\u001b[39m])\n\u001b[0;32m 14\u001b[0m dE_dh1 \u001b[39m=\u001b[39m ReLUBackward(dE_dh1r, var[\u001b[39m'\u001b[39m\u001b[39mh1\u001b[39m\u001b[39m'\u001b[39m], var[\u001b[39m'\u001b[39m\u001b[39mh1r\u001b[39m\u001b[39m'\u001b[39m])\n\u001b[1;32m---> 15\u001b[0m _, dE_dW1, dE_db1 \u001b[39m=\u001b[39m AffineBackward(dE_dh1, var[\u001b[39m'\u001b[39;49m\u001b[39mx\u001b[39;49m\u001b[39m'\u001b[39;49m], 
model[\u001b[39m'\u001b[39;49m\u001b[39mW1\u001b[39;49m\u001b[39m'\u001b[39;49m])\n\u001b[0;32m 17\u001b[0m grads \u001b[39m=\u001b[39m {}\n\u001b[0;32m 18\u001b[0m grads[\u001b[39m'\u001b[39m\u001b[39mW1\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m dE_dW1\n", + "Cell \u001b[1;32mIn [49], line 14\u001b[0m, in \u001b[0;36mAffineBackward\u001b[1;34m(grad_y, x, w)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mAffineBackward\u001b[39m(grad_y: np\u001b[39m.\u001b[39mndarray, x: np\u001b[39m.\u001b[39mndarray, w: np\u001b[39m.\u001b[39mndarray) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Tuple[np\u001b[39m.\u001b[39mndarray, np\u001b[39m.\u001b[39mndarray, np\u001b[39m.\u001b[39mndarray]:\n\u001b[0;32m 2\u001b[0m \u001b[39m\"\"\"Computes gradients of affine transformation.\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \n\u001b[0;32m 4\u001b[0m \u001b[39m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[39m grad_b: Gradients wrt. the biases.\u001b[39;00m\n\u001b[0;32m 13\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 14\u001b[0m grad_x \u001b[39m=\u001b[39m grad_y\u001b[39m.\u001b[39;49mdot(w\u001b[39m.\u001b[39;49mT)\n\u001b[0;32m 15\u001b[0m grad_w \u001b[39m=\u001b[39m x\u001b[39m.\u001b[39mT\u001b[39m.\u001b[39mdot(grad_y)\n\u001b[0;32m 16\u001b[0m grad_b \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39msum(grad_y, axis\u001b[39m=\u001b[39m\u001b[39m0\u001b[39m)\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "main()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "학습 오차(training error)와 일반화를 위한 검증 오차(validation error) 결과는 다음과 같습니다.\n", + "\n", + "```\n", + "CE: Train 0.25610 Validation 0.97890 Test 0.78023\n", + "Acc: Train 0.90486 Validation 0.73986 Test 0.77143\n", + "```\n", + "\n", + "그래프는 다음과 같습니다.\n", + "\n", + "![loss_graph](./defaultLossGraph.png)\n", + "![accuracy_graph](./defaultAccuracyGraph.png)\n", + "\n", + "학습 오차가 크게 감소하고 일반화 검증 오차는 
점점 증가하는 것을 확인할 수 있습니다." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 최적화 (optimization): Learning rate, momentum, mini-batch size 세 가지 종류의 parameter 들을 아래와 같이 변화시키면서 다양한 조합들에 대해 신경망이 cross-entropy 관점에서 어떻게 수렴하는지 살펴본다. 가장 우수한 성능을 나타내는 hyperparameter 들의 조합이 어떤 것인지 제시하시오. (모든 경우의 수를 다 따지면 75 가지 신경망 모델을 테스트해야 하나 시간이 너무 많이 걸릴 수 있으므로 이 중에서 일부분의 경우들만 테스트해도 된다. 그러나 어떤 근거로 해당 조합들만 테스트했는지 적당한 설명이 있어야 함.)\n", + " - Learning rate ( $\\epsilon$ ): 0.001 에서 1.0 사이의 5 가지 경우\n", + " - Momentum: 0.0 에서 0.9 사이의 3 가지 경우\n", + " - Mini-batch size: 1 에서 1000 까지의 5 가지 경우\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 실험 코드" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{1, 2, 7, 14, 241, 482, 1687, 3374}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import itertools\n", + "# 3374 can be factorized into 2 * 7 * 241\n", + "factor = set([1, 2, 7, 241])\n", + "# make all the multiplication of combinations of factors\n", + "combinations = [set(np.prod(x) for x in itertools.combinations(factor, i)) for i in range(1, len(factor)+1)]\n", + "combinations = set.union(*combinations)\n", + "combinations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "나머지가 없는 Mini-batch size는 1, 2, 7, 14, 241, 482, 1687, 3374 입니다." + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[3.9810717055349727,\n", + " 15.848931924611136,\n", + " 63.095734448019314,\n", + " 251.1886431509581,\n", + " 1000.0]" + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[1000**(i/5) for i in range(1, 6)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1000을 배수단위로 5등분을 하면 3.98, 15.84, 63.09, 251.18, 1000 이 됩니다."
+ ] + }, + { + "cell_type": "code", + "execution_count": 203, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(65, 59, 0.09230769230769231),\n", + " (69, 62, 0.10144927536231885),\n", + " (64, 46, 0.28125),\n", + " (68, 42, 0.38235294117647056),\n", + " (63, 35, 0.4444444444444444),\n", + " (62, 26, 0.5806451612903226),\n", + " (67, 24, 0.6417910447761194),\n", + " (61, 19, 0.6885245901639344),\n", + " (56, 14, 0.75),\n", + " (60, 14, 0.7666666666666667)]" + ] + }, + "execution_count": 203, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# pick top 10\n", + "center, width = 63, 15\n", + "cand = [(i,3374 % i, (i - 3374 % i) / i) for i in range(center-(width//2), center+(width//2))]\n", + "cand = sorted(cand, key=lambda x: x[2])\n", + "cand[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "그래서 2, 14, 65, 241, 844를 선택하였습니다." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "ExperimentMLP() missing 1 required positional argument: 'title'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn [20], line 12\u001b[0m\n\u001b[0;32m 1\u001b[0m conf \u001b[39m=\u001b[39m Config(\n\u001b[0;32m 2\u001b[0m num_inputs\u001b[39m=\u001b[39m\u001b[39m2304\u001b[39m,\n\u001b[0;32m 3\u001b[0m num_hiddens\u001b[39m=\u001b[39m[\u001b[39m16\u001b[39m, \u001b[39m32\u001b[39m],\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 10\u001b[0m patience\u001b[39m=\u001b[39m\u001b[39m100\u001b[39m,\n\u001b[0;32m 11\u001b[0m )\n\u001b[1;32m---> 12\u001b[0m _, stat \u001b[39m=\u001b[39m ExperimentMLP(conf, save_dir\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mresults/example_lr=0.5\u001b[39;49m\u001b[39m'\u001b[39;49m, 
show\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, pplot\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[0;32m 13\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mTest accuracy: \u001b[39m\u001b[39m{\u001b[39;00mstat\u001b[39m.\u001b[39mtest_acc\u001b[39m:\u001b[39;00m\u001b[39m.4f\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m, Test cross entropy: \u001b[39m\u001b[39m{\u001b[39;00mstat\u001b[39m.\u001b[39mtest_ce\u001b[39m:\u001b[39;00m\u001b[39m.4f\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 14\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mDone\u001b[39m\u001b[39m\"\u001b[39m)\n", + "\u001b[1;31mTypeError\u001b[0m: ExperimentMLP() missing 1 required positional argument: 'title'" + ] + } + ], + "source": [ + "conf = Config(\n", + " num_inputs=2304,\n", + " num_hiddens=[16, 32],\n", + " num_outputs=7,\n", + " eps=0.5,\n", + " momentum=0.0,\n", + " num_epochs=1000,\n", + " batch_size=844,\n", + " early_stopping=True,\n", + " patience=100,\n", + ")\n", + "_, stat = ExperimentMLP(conf, save_dir='results/example_lr=0.5', show=True, pplot=True)\n", + "print(f\"Test accuracy: {stat.test_acc:.4f}, Test cross entropy: {stat.test_ce:.4f}\")\n", + "print(\"Done\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "위 셀에서 lr을 변경해서 나올 수 있는 결과는 다음과 같습니다.\n", + "\n", + "![lr1.0plot](./lr1plot.png)\n", + "\n", + "> lr 1.0 plot\n", + " \n", + "![lr0.5plot](./lr0.5plot.png)\n", + "\n", + "> lr 0.5 plot \n", + "\n", + "다음 그래프를 보면 알 수 있듯 배치를 아무리 높여도 lr = 0.5 일때 최적화가 잘 될 수 없다고 판단 하였습니다. 그래서 제외하였습니다. 1.0도 마찬가지이므로 제외하였습니다." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "conf = Config(\n", + " num_inputs=2304,\n", + " num_hiddens=[16, 32],\n", + " num_outputs=7,\n", + " eps=0.01,\n", + " momentum=0.0,\n", + " num_epochs=1000,\n", + " batch_size=100,\n", + " early_stopping=True,\n", + " patience=50,\n", + ")\n", + "\n", + "# Grid search for hyperparameters.\n", + "lr_candidates = [0.1, 0.05, 0.01, 0.005, 0.001]\n", + "momentum_candidates = [0.0, 0.5, 0.9]\n", + "mini_batch_size_candidates = [2, 14, 65, 241, 844]\n", + "\n", + "# Make all combinations of hyperparameters.\n", + "import itertools\n", + "\n", + "experiments_list = [*itertools.product(lr_candidates, momentum_candidates, mini_batch_size_candidates)]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All experiments completed\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "experiments = load_experiment_metafile('experiments.json', \n", + " init_task_if_not_exists=experiments_list)\n", + "\n", + "if len(experiments[\"remain_experiments\"]) == 0:\n", + " print(\"All experiments completed\")\n", + "\n", + "# Run experiments.\n", + "# tqdm nested progress bar is not working in my jupyter notebook\n", + "# so I just print the progress\n", + "while len(experiments['remain_experiments']) > 0:\n", + " # get next experiment\n", + " lr, momentum, mini_batch_size = experiments['remain_experiments'].pop(0)\n", + " # set experiment directory\n", + " save_dir = f\"results/lr={lr}_momentum={momentum}_batch_size={mini_batch_size}\"\n", + " # create experiment config\n", + " conf.eps = lr\n", + " conf.momentum = momentum\n", + " conf.batch_size = mini_batch_size\n", + " # print experiment parameters\n", + " print(f\"Experiment: lr={lr}, momentum={momentum}, batch_size={mini_batch_size}\")\n", + " \n", + " start = time.time()\n", + " # run experiment\n", + " 
ExperimentMLP(conf, save_dir=save_dir, show=False)\n", + " \n", + " end = time.time()\n", + " # add experiment to completed experiments\n", + " experiments['completed_experiment_results'].append({\n", + " \"lr\": lr,\n", + " \"momentum\": momentum,\n", + " \"mini_batch_size\": mini_batch_size,\n", + " \"save_dir\": save_dir,\n", + " \"time\": end - start\n", + " })\n", + " # print completed experiments , remaining experiments and time taken\n", + " print(\"\\n\".join([f\"Completed experiments: {len(experiments['completed_experiment_results'])}\",\n", + " f\"Remaining experiments: {len(experiments['remain_experiments'])}\",\n", + " f\"Time taken: {end - start:.2f} seconds\"]))\n", + " # save experiments\n", + " save_experiment_metafile('experiments.json', experiments)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "위의 셀들을 실행하면 실험을 진행할 수 있습니다." + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "def get_lbm_experiment_results(experiments: Dict[str, Any], use_tqdm: bool = False):\n", + " \"\"\"Get experiment results from meta data\"\"\"\n", + " experiments_results = experiments['completed_experiment_results']\n", + " if use_tqdm:\n", + " experiments_results = tqdm(experiments_results)\n", + " results = []\n", + " for experiment in experiments_results:\n", + " # load experiment statistics\n", + " _, stat, _ = load_experiment(experiment['save_dir'], load_model=False)\n", + " i, best_valid_acc = stat.best_valid_acc()\n", + " # add experiment parameters and statistics to results\n", + " results.append({\n", + " \"lr\": experiment['lr'],\n", + " \"momentum\": experiment['momentum'],\n", + " \"mini_batch_size\": experiment['mini_batch_size'],\n", + " \"test_acc\": stat.test_acc,\n", + " \"test_ce\": stat.test_ce,\n", + " \"train_acc\": stat.train_acc[i][1],\n", + " \"train_ce\": stat.train_ce[i][1],\n", + " \"valid_acc\": best_valid_acc,\n", + "
\"valid_ce\": stat.valid_ce[i][1],\n", + " \"time\": experiment['time']\n", + " })\n", + " return results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 결과" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 75/75 [00:00<00:00, 612.94it/s]\n" + ] + } + ], + "source": [ + "# load experiments\n", + "experiments = load_experiment_metafile('experiments.json')\n", + "# get experiment results\n", + "results = get_lbm_experiment_results(experiments, use_tqdm=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "#### Momentum = 0.0\n", + "| |0.1|0.05|0.01|0.005|0.001|\n", + "|---|---|---|---|---|---|\n", + "|2|28.16|44.87|75.18|76.37|75.42|\n", + "|14|71.12|74.22|77.57|76.37|72.32|\n", + "|65|74.46|76.61|74.46|74.70|66.11|\n", + "|241|65.16|73.27|73.51|66.83|47.26|\n", + "|844|50.60|68.97|63.48|55.61|28.16|" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "#### Momentum = 0.5\n", + "| |0.1|0.05|0.01|0.005|0.001|\n", + "|---|---|---|---|---|---|\n", + "|2|27.92|35.08|72.55|76.61|75.66|\n", + "|14|47.26|70.41|74.46|77.33|74.22|\n", + "|65|53.22|67.78|75.42|74.94|69.69|\n", + "|241|47.97|67.54|72.55|71.12|59.43|\n", + "|844|50.12|64.20|70.64|63.96|27.92|" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "#### Momentum = 0.9\n", + "| |0.1|0.05|0.01|0.005|0.001|\n", + "|---|---|---|---|---|---|\n", + "|2|27.92|27.92|27.92|47.73|73.51|\n", + "|14|27.92|27.92|69.45|72.32|76.37|\n", + "|65|27.92|47.73|74.46|75.89|73.27|\n", + "|241|32.70|58.47|75.66|73.51|71.84|\n", + "|844|35.80|53.46|72.79|71.36|60.86|" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": 
"display_data" + } + ], + "source": [ + "from IPython.display import display, HTML, Markdown\n", + "\n", + "for m in momentum_candidates:\n", + " markdown_table_content = []\n", + " markdown_table_content.append(\"| |\" + \"|\".join(map(str, lr_candidates)) + \"|\")\n", + " markdown_table_content.append(\"|\" + \"|\".join([\"---\"] * (len(lr_candidates) + 1)) + \"|\")\n", + " for batch in mini_batch_size_candidates:\n", + " inner_content = []\n", + " markdown_table_content.append(\"|\" + str(batch) + \"|\")\n", + " for lr in lr_candidates:\n", + " # filter results\n", + " filtered_results = [result for result in results if result['lr'] == lr and result['momentum'] == m and result['mini_batch_size'] == batch]\n", + " if len(filtered_results) == 1:\n", + " result = filtered_results[0]\n", + " inner_content.append(f\"{result['valid_acc'] * 100:.2f}\")\n", + " markdown_table_content[-1] += \"|\".join(inner_content) + \"|\"\n", + " display(Markdown(f\"#### Momentum = {m}\\n\" + \"\\n\".join(markdown_table_content)))\n", + " # print(f\"#### Momentum = {m}\\n\"+\"\\n\".join(markdown_table_content))\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Momentum = 0.0\n", + "| |0.1|0.05|0.01|0.005|0.001|\n", + "|---|---|---|---|---|---|\n", + "|2|28.16|44.87|75.18|76.37|75.42|\n", + "|14|71.12|74.22|77.57|76.37|72.32|\n", + "|65|74.46|76.61|74.46|74.70|66.11|\n", + "|241|65.16|73.27|73.51|66.83|47.26|\n", + "|844|50.60|68.97|63.48|55.61|28.16|\n", + "\n", + "##### Momentum = 0.5\n", + "| |0.1|0.05|0.01|0.005|0.001|\n", + "|---|---|---|---|---|---|\n", + "|2|27.92|35.08|72.55|76.61|75.66|\n", + "|14|47.26|70.41|74.46|77.33|74.22|\n", + "|65|53.22|67.78|75.42|74.94|69.69|\n", + "|241|47.97|67.54|72.55|71.12|59.43|\n", + "|844|50.12|64.20|70.64|63.96|27.92|\n", + "\n", + "##### Momentum = 0.9\n", + "| |0.1|0.05|0.01|0.005|0.001|\n", + "|---|---|---|---|---|---|\n", + "|2|27.92|27.92|27.92|47.73|73.51|\n", + 
"|14|27.92|27.92|69.45|72.32|76.37|\n", + "|65|27.92|47.73|74.46|75.89|73.27|\n", + "|241|32.70|58.47|75.66|73.51|71.84|\n", + "|844|35.80|53.46|72.79|71.36|60.86|\n", + "\n", + "다음과 같은 결과가 나왔습니다." + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from typing import Callable\n", + "import matplotlib.tri as tri\n", + "\n", + "def plot_results(x_values: Union[List[float], np.ndarray],\n", + " y_values: Union[List[float], np.ndarray],\n", + " z_values: Union[List[float], np.ndarray],\n", + " xlabel: str, \n", + " ylabel: str,\n", + " title: str,\n", + " contour_levels: int = 14,\n", + " figsize: Tuple[int, int] = (10, 10)):\n", + " \"\"\"Plot experiment results\n", + " \n", + " Args:\n", + " results: list of experiment results\n", + " x_values: x list or array\n", + " y_values: y list or array\n", + " z_values: z list or array\n", + " title: plot title\n", + " xlabel: x axis label\n", + " ylabel: y axis label\n", + " contour_levels: number of contour levels\n", + " figsize: figure size\n", + " \"\"\"\n", + " \n", + " plt.figure(figsize=figsize)\n", + " plt.title(title)\n", + " plt.xlabel(xlabel)\n", + " plt.ylabel(ylabel)\n", + " plt.scatter(x_values, y_values, c=z_values, cmap='viridis')\n", + " plt.colorbar()\n", + "\n", + " # create triangulation\n", + " triang = tri.Triangulation(x_values, y_values)\n", + "\n", + " # interpolate data\n", + " interpolator = tri.LinearTriInterpolator(triang, z_values)\n", + " xi = np.linspace(min(x_values), max(x_values), 100)\n", + " yi = np.linspace(min(y_values), max(y_values), 100)\n", + " Xi, Yi = np.meshgrid(xi, yi)\n", + " zi = interpolator(Xi, Yi)\n", + "\n", + " # plot contour\n", + " plt.contour(xi, yi, zi, colors='k', levels=contour_levels, linewidths=0.5, alpha=0.5)\n", + " plt.contourf(xi, yi, zi, levels=contour_levels, cmap='viridis', alpha=0.5)\n", + " plt.show()\n", + "\n", + "for m in momentum_candidates:\n", + " x_values = np.log([r['lr'] for r in results if r['momentum'] == m])\n", + " y_values = np.log([r['mini_batch_size'] for r in results if r['momentum'] == m])\n", + " z_values = [r['valid_acc'] for r in results if r['momentum'] == m]\n", + "\n", + " plot_results(\n", 
+ " x_values=x_values,\n", + " y_values=y_values,\n", + " z_values=z_values,\n", + " title=f'momentum {m} valid accuracy',\n", + " xlabel='Logged Learning rate',\n", + " ylabel='Logged mini batch size',\n", + " contour_levels=10,\n", + " figsize=(5, 5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "위 셀을 실행하면 다음과 같은 결과가 나옵니다.\n", + "\n", + "![batch_lr_graph](./llr_lbs_m0_v_acc.png)\n", + "![batch_lr_graph](./llr_lbs_m5_v_acc.png)\n", + "![batch_lr_graph](./llr_lbs_m9_v_acc.png)\n", + "\n", + "이 그래프를 보면 알 수 있듯이 learning rate와 mini-batch size는 서로 비례 관계에 있습니다. learning rate가 커지면 mini-batch size도 커져야 좋은 결과를 얻을 수 있습니다. 그리고 momentum은 batch size를 크게 해도 성능이 떨어지지 않게 해주는 것을 알 수 있습니다. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "65, 0.01, 0.5의 조합이 가장 최적으로 보입니다." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. 신경망 모델 구조 변경: Momentum 을 0.9로 고정시킨 상태에서 신경망의 hidden unit 들의 갯수를 2 에서 100 사이의 3 가지 다른 경우에 대해 성능을 비교한다. 필요한 경우 learning rate 와 학습 기간(epochs)은 신경망 구조에 따라 적당하게 변경할 수 있다. Hidden unit 의 갯수들이 학습에서의 수렴과 신경망의 일반화 성능에 미치는 영향에 대한 데이터(표나 그래프)를 제시하고 경향을 분석하시오."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 실험 코드" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conf = Config(\n", + " num_inputs=2304,\n", + " num_hiddens=[16, 32],\n", + " num_outputs=7,\n", + " eps=0.01,\n", + " momentum=0.9,\n", + " num_epochs=1000,\n", + " batch_size=241,\n", + " early_stopping=True,\n", + " patience=50,\n", + ")\n", + "\n", + "num_hidden_candidates = [2, 4, 8, 16, 32, 64, 100]\n", + "hidden_candidates = itertools.product(num_hidden_candidates, num_hidden_candidates)\n", + "\n", + "experiments_list = load_experiment_metafile('experiments_hidden.json',\n", + " init_task_if_not_exists=hidden_candidates)\n", + "\n", + "while len(experiments_list) > 0:\n", + " num_hiddens = experiments_list[\"remain_experiments\"].pop()\n", + "\n", + " print(f\"Running experiment with {num_hiddens} hidden units\")\n", + " conf.num_hiddens = num_hiddens\n", + "\n", + " save_dir = f\"results_hidden/{num_hiddens[0]}_{num_hiddens[1]}\"\n", + " _, stat, _ = ExperimentMLP(conf, title=f\"hidden {num_hiddens}\", show=False, \n", + " save_dir=save_dir)\n", + " \n", + " i, best_valid_acc = stat.best_valid_acc()\n", + " experiments_list[\"completed_experiment_results\"].append({\n", + " \"num_hiddens\": num_hiddens,\n", + " \"save_dir\": save_dir,\n", + " \"test_acc\": stat.test_acc,\n", + " \"test_ce\": stat.test_ce,\n", + " \"train_acc\": stat.train_acc[i][1],\n", + " \"train_ce\": stat.train_ce[i][1],\n", + " \"valid_acc\": best_valid_acc,\n", + " \"valid_ce\": stat.valid_ce[i][1],\n", + " \"time\": stat.time\n", + " })\n", + " save_experiment_metafile('experiments_hidden.json', experiments_list)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.2 ('hw3': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + }, + "vscode": { + "interpreter": { + "hash": "82fd07bec16cb4479257adc108d4dc98de3f270fc95dcdba0cb0fb16f10a7c36" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirement.txt b/requirement.txt new file mode 100644 index 0000000..7b76b07 Binary files /dev/null and b/requirement.txt differ diff --git a/toronto_face.npz b/toronto_face.npz new file mode 100644 index 0000000..054fec6 Binary files /dev/null and b/toronto_face.npz differ