diff --git a/Nam/Final_code 1.py b/Nam/Final_code 1.py new file mode 100644 index 0000000..e2a381f --- /dev/null +++ b/Nam/Final_code 1.py @@ -0,0 +1,45 @@ + +import pandas as pd +import pickle +from tensorflow.keras.models import load_model +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score +from url_preprocessing import preprocess_url_dataframe # 너가 만든 전처리 모듈 + +# 1. 원본 데이터 불러오기 +df = pd.read_csv("train.csv") # 또는 적절한 파일명으로 수정 +print("원본 데이터 불러옴") + +# 2. 전처리 적용 +df_processed = preprocess_url_dataframe(df) +print(" 전처리 완료") + +# 3. 피처/레이블 분리 +X = df_processed.drop(columns=['label', 'URL', 'URL_clean'], errors='ignore') # 'label' 없으면 자동 무시 +y = df_processed['label'] if 'label' in df_processed.columns else None + +# 4. 스케일러 불러오기 +with open("scaler.pkl", "rb") as f: + scaler = pickle.load(f) +X_scaled = scaler.transform(X) +print(" 스케일링 완료") + +# 5. 모델 불러오기 +model = load_model("best_model.h5") +print(" 모델 불러오기 완료") + +# 6. 예측 +y_pred_proba = model.predict(X_scaled).ravel() +best_threshold = 0.34 # 여기에 저장된 값이 있다면 pickle로 불러올 수 있음 + +y_pred = (y_pred_proba > best_threshold).astype(int) + +# 7. 결과 출력 +if y is not None: + print("예측 결과 (테스트셋 평가)") + print("Accuracy:", accuracy_score(y, y_pred)) + print("F1 Score:", f1_score(y, y_pred)) + print("Precision:", precision_score(y, y_pred)) + print("Recall:", recall_score(y, y_pred)) +else: + print("예측 완료! 라벨이 없어 평가 생략") + print("예측 결과 샘플:", y_pred[:10]) diff --git a/Nam/best_model 1.h5 b/Nam/best_model 1.h5 new file mode 100644 index 0000000..ae3950d Binary files /dev/null and b/Nam/best_model 1.h5 differ diff --git a/Nam/scaler 1.pkl b/Nam/scaler 1.pkl new file mode 100644 index 0000000..65ea96a Binary files /dev/null and b/Nam/scaler 1.pkl differ diff --git a/jun/code.ipynb b/jun/code.ipynb index 83501d9..417dd2e 100755 --- a/jun/code.ipynb +++ b/jun/code.ipynb @@ -2392,703 +2392,56 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": {}, "outputs": [ { - "data": { - "application/vnd.microsoft.datawrangler.viewer.v0+json": { - "columns": [ - { - "name": "index", - "rawType": "object", - "type": "string" - }, - { - "name": "label", - "rawType": "float64", - "type": "float" - }, - { - "name": "url_length_cat", - "rawType": "float64", - "type": "float" - }, - { - "name": "num_dots", - "rawType": "float64", - "type": "float" - }, - { - "name": "num_digits", - "rawType": "float64", - "type": "float" - }, - { - "name": "num_special_chars", - "rawType": "float64", - "type": "float" - }, - { - "name": "url_keyword", - "rawType": "float64", - "type": "float" - }, - { - "name": "num_underbar", - "rawType": "float64", - "type": "float" - }, - { - "name": "extract_consecutive_numbers", - "rawType": "float64", - "type": "float" - }, - { - "name": "number", - "rawType": "float64", - "type": "float" - }, - { - "name": "upper", - "rawType": "float64", - "type": "float" - }, - { - "name": "is_common_tld", - "rawType": "float64", - "type": "float" - }, - { - "name": "is_country_tld", - "rawType": "float64", - "type": "float" - }, - { - "name": "is_suspicious_tld", - "rawType": "float64", - "type": "float" - }, - { - "name": "domain_length", - "rawType": "float64", - "type": "float" - }, - { - "name": "has_subdomain", - "rawType": "float64", - "type": "float" - }, - { - "name": "subdomain_length", - "rawType": "float64", - "type": "float" - }, - { - "name": "subdomain_count", - "rawType": "float64", - "type": "float" - }, - { - "name": "path_depth", - "rawType": "float64", - "type": "float" - }, - { - "name": "has_query", - "rawType": "float64", - "type": "float" - }, - { - "name": "query_length", - "rawType": "float64", - "type": "float" - }, - { - "name": "query_param_count", - "rawType": "float64", - "type": "float" - }, - { - "name": "url_shorteners", - "rawType": "float64", - "type": "float" - }, - { - "name": "compression_ratio", - "rawType": "float64", - "type": "float" - }, - { - "name": "entropy", - "rawType": "float64", - "type": "float" - }, - { - "name": "digit_ratio", - "rawType": "float64", - "type": "float" - }, - { - "name": "special_char_ratio", - "rawType": "float64", - "type": "float" - } - ], - "conversionMethod": "pd.DataFrame", - "ref": "c79a077e-8e52-4e42-b88f-dc9698b0fa30", - "rows": [ - [ - "count", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0", - "6995056.0" - ], - [ - "mean", - "0.22371472079708868", - "1.4435534183000107", - "1.546944584861079", - "1.6343590387267808", - "2.6635716711917676", - "0.0370789025849114", - "0.045005501028154746", - "0.056463736673444787", - "0.08128040719044995", - "0.0357764112252997", - "0.6133649251700057", - "0.12739140329970197", - "0.022784949827420967", - "10.464007150192936", - "0.21130266862767075", - "2.43731000866898", - "0.2660177416735477", - "0.6056849294701858", - "0.027221368921135157", - "1.9155892390282507", - "0.04228915393958247", - "0.0018421582329004942", - "1.4552534994784176", - "3.5360434022769756", - "0.029042428345387533", - "0.1102289088601276" - ], - [ - "std", - "0.41673309122602675", - "1.1161203432813147", - "1.010078604927829", - "9.827940363271033", - "7.1618457272654", - "0.18895518694176003", - "0.6023702991784359", - "0.23081505741717664", - "0.273265280035072", - "0.18573223887275842", - "0.4869788780260291", - "0.33341093196934307", - "0.14921728811320575", - "5.0652546813544035", - "0.4082326232468674", - "6.90096602515224", - "0.6272395647222854", - "1.6003209664806863", - "0.1627279010519657", - "19.702068343354906", - "0.35208851309719974", - "0.04288082262284407", - "0.24856536988340924", - "0.47898938276414027", - "0.08255957016074264", - "0.046338026902092454" - ], - [ - "min", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.010181818181818183", - "-0.0", - "0.0", - "0.0" - ], - [ - "25%", - "0.0", - "0.0", - "1.0", - "0.0", - "1.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "7.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "1.3076923076923077", - "3.238901256602631", - "0.0", - "0.07142857142857142" - ], - [ - "50%", - "0.0", - "1.0", - "1.0", - "0.0", - "2.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "1.0", - "0.0", - "0.0", - "10.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "1.4444444444444444", - "3.5068905956085183", - "0.0", - "0.10344827586206896" - ], - [ - "75%", - "0.0", - "2.0", - "2.0", - "0.0", - "3.0", - "0.0", - "0.0", - "0.0", - "0.0", - "0.0", - "1.0", - "0.0", - "0.0", - "13.0", - "0.0", - "0.0", - "0.0", - "1.0", - "0.0", - "0.0", - "0.0", - "0.0", - "1.6153846153846154", - "3.7962176025900556", - "0.0", - "0.14285714285714285" - ], - [ - "max", - "1.0", - "3.0", - "171.0", - "2011.0", - "8198.0", - "1.0", - "136.0", - "1.0", - "1.0", - "1.0", - "1.0", - "1.0", - "1.0", - "63.0", - "1.0", - "237.0", - "38.0", - "136.0", - "1.0", - "8367.0", - "131.0", - "1.0", - "5.0", - "6.570554108088201", - "0.9545454545454546", - "1.0" - ] - ], - "shape": { - "columns": 26, - "rows": 8 - } - }, - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labelurl_length_catnum_dotsnum_digitsnum_special_charsurl_keywordnum_underbarextract_consecutive_numbersnumberupper...subdomain_countpath_depthhas_queryquery_lengthquery_param_counturl_shortenerscompression_ratioentropydigit_ratiospecial_char_ratio
count6.995056e+066.995056e+066.995056e+066.995056e+066.995056e+066.995056e+066.995056e+066.995056e+066.995056e+066.995056e+06...6.995056e+066.995056e+066.995056e+066.995056e+066.995056e+066.995056e+066.995056e+066.995056e+066.995056e+066.995056e+06
mean2.237147e-011.443553e+001.546945e+001.634359e+002.663572e+003.707890e-024.500550e-025.646374e-028.128041e-023.577641e-02...2.660177e-016.056849e-012.722137e-021.915589e+004.228915e-021.842158e-031.455253e+003.536043e+002.904243e-021.102289e-01
std4.167331e-011.116120e+001.010079e+009.827940e+007.161846e+001.889552e-016.023703e-012.308151e-012.732653e-011.857322e-01...6.272396e-011.600321e+001.627279e-011.970207e+013.520885e-014.288082e-022.485654e-014.789894e-018.255957e-024.633803e-02
min0.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00...0.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+001.018182e-02-0.000000e+000.000000e+000.000000e+00
25%0.000000e+000.000000e+001.000000e+000.000000e+001.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00...0.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+001.307692e+003.238901e+000.000000e+007.142857e-02
50%0.000000e+001.000000e+001.000000e+000.000000e+002.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00...0.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+001.444444e+003.506891e+000.000000e+001.034483e-01
75%0.000000e+002.000000e+002.000000e+000.000000e+003.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00...0.000000e+001.000000e+000.000000e+000.000000e+000.000000e+000.000000e+001.615385e+003.796218e+000.000000e+001.428571e-01
max1.000000e+003.000000e+001.710000e+022.011000e+038.198000e+031.000000e+001.360000e+021.000000e+001.000000e+001.000000e+00...3.800000e+011.360000e+021.000000e+008.367000e+031.310000e+021.000000e+005.000000e+006.570554e+009.545455e-011.000000e+00
\n", - "

8 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " label url_length_cat num_dots num_digits \\\n", - "count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n", - "mean 2.237147e-01 1.443553e+00 1.546945e+00 1.634359e+00 \n", - "std 4.167331e-01 1.116120e+00 1.010079e+00 9.827940e+00 \n", - "min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n", - "25% 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00 \n", - "50% 0.000000e+00 1.000000e+00 1.000000e+00 0.000000e+00 \n", - "75% 0.000000e+00 2.000000e+00 2.000000e+00 0.000000e+00 \n", - "max 1.000000e+00 3.000000e+00 1.710000e+02 2.011000e+03 \n", - "\n", - " num_special_chars url_keyword num_underbar \\\n", - "count 6.995056e+06 6.995056e+06 6.995056e+06 \n", - "mean 2.663572e+00 3.707890e-02 4.500550e-02 \n", - "std 7.161846e+00 1.889552e-01 6.023703e-01 \n", - "min 0.000000e+00 0.000000e+00 0.000000e+00 \n", - "25% 1.000000e+00 0.000000e+00 0.000000e+00 \n", - "50% 2.000000e+00 0.000000e+00 0.000000e+00 \n", - "75% 3.000000e+00 0.000000e+00 0.000000e+00 \n", - "max 8.198000e+03 1.000000e+00 1.360000e+02 \n", - "\n", - " extract_consecutive_numbers number upper ... \\\n", - "count 6.995056e+06 6.995056e+06 6.995056e+06 ... \n", - "mean 5.646374e-02 8.128041e-02 3.577641e-02 ... \n", - "std 2.308151e-01 2.732653e-01 1.857322e-01 ... \n", - "min 0.000000e+00 0.000000e+00 0.000000e+00 ... \n", - "25% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n", - "50% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n", - "75% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n", - "max 1.000000e+00 1.000000e+00 1.000000e+00 ... \n", - "\n", - " subdomain_count path_depth has_query query_length \\\n", - "count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n", - "mean 2.660177e-01 6.056849e-01 2.722137e-02 1.915589e+00 \n", - "std 6.272396e-01 1.600321e+00 1.627279e-01 1.970207e+01 \n", - "min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n", - "25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n", - "50% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n", - "75% 0.000000e+00 1.000000e+00 0.000000e+00 0.000000e+00 \n", - "max 3.800000e+01 1.360000e+02 1.000000e+00 8.367000e+03 \n", - "\n", - " query_param_count url_shorteners compression_ratio entropy \\\n", - "count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n", - "mean 4.228915e-02 1.842158e-03 1.455253e+00 3.536043e+00 \n", - "std 3.520885e-01 4.288082e-02 2.485654e-01 4.789894e-01 \n", - "min 0.000000e+00 0.000000e+00 1.018182e-02 -0.000000e+00 \n", - "25% 0.000000e+00 0.000000e+00 1.307692e+00 3.238901e+00 \n", - "50% 0.000000e+00 0.000000e+00 1.444444e+00 3.506891e+00 \n", - "75% 0.000000e+00 0.000000e+00 1.615385e+00 3.796218e+00 \n", - "max 1.310000e+02 1.000000e+00 5.000000e+00 6.570554e+00 \n", - "\n", - " digit_ratio special_char_ratio \n", - "count 6.995056e+06 6.995056e+06 \n", - "mean 2.904243e-02 1.102289e-01 \n", - "std 8.255957e-02 4.633803e-02 \n", - "min 0.000000e+00 0.000000e+00 \n", - "25% 0.000000e+00 7.142857e-02 \n", - "50% 0.000000e+00 1.034483e-01 \n", - "75% 0.000000e+00 1.428571e-01 \n", - "max 9.545455e-01 1.000000e+00 \n", - "\n", - "[8 rows x 26 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" + "ename": "NameError", + "evalue": "name 'processed_train' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mprocessed_train\u001b[49m.describe()\n", + "\u001b[31mNameError\u001b[39m: name 'processed_train' is not defined" + ] } ], "source": [ "processed_train.describe()" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'processed_train' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mmatplotlib\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mpyplot\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mplt\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mseaborn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01msns\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m desc = \u001b[43mprocessed_train\u001b[49m.describe()\n\u001b[32m 6\u001b[39m plt.figure(figsize=(\u001b[32m12\u001b[39m, \u001b[32m6\u001b[39m))\n\u001b[32m 7\u001b[39m sns.barplot(data=desc.T[[\u001b[33m'\u001b[39m\u001b[33mmean\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mstd\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mmin\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mmax\u001b[39m\u001b[33m'\u001b[39m]])\n", + "\u001b[31mNameError\u001b[39m: name 'processed_train' is not defined" + ] + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "desc = processed_train.describe()\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "sns.barplot(data=desc.T[['mean', 'std', 'min', 'max']])\n", + "plt.title('Feature Statistics')\n", + "plt.xticks(rotation=45)\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, { "cell_type": "code", "execution_count": 11, @@ -3248,12 +2601,12 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf\n", - "from tensorflow.keras.layers import Dense, Dropout, BatchNormalization\n", + "from tensorflow.keras.layers import Dense\n", "\n", "def build_model(input_dim, learning_rate=0.001):\n", " \"\"\"\n",