From 58056ce8a424024877bc8ac41074a60551d0d901 Mon Sep 17 00:00:00 2001 From: hwangtaehyeon Date: Wed, 21 May 2025 11:03:24 +0000 Subject: [PATCH] Upload files to "Nam" ---
Review notes (reviewer commentary placed after the three-dash separator; not part of the commit message or of any hunk):
  * Nam/Feature.py: adds calculate_url_entropy(url), which counts characters with Counter, divides each count by len(url), and passes the result to scipy.stats.entropy with base=2. extract_url_features() gains the keys 'url_length' and 'url_entropy'; the other entries change only in the whitespace before the colon.
  * NOTE(review): the result of calculate_url_entropy for an empty URL (empty probability list) is not visible from this patch - confirm what scipy returns and whether callers can pass "".
  * Nam/model.running_code.py: defines a second, identical calculate_url_entropy and re-assigns features['url_length'] / features['url_entropy'] after calling extract_url_features(), which already returns both keys once this patch is applied. This looks redundant - confirm, and consider importing the helper from Feature instead.
  * The script now builds the DataFrame from the feature dict and selects columns via scaler.feature_names_in_ before scaler.transform(). This presumably requires that the scaler was fitted on data with named columns - TODO confirm against the training code.
  * The prediction is converted with float(prediction.numpy()[0][0]) and the printed text changes from '악' / '정' to a label plus a probability formatted to four decimals.
  * The sklearn.metrics import is removed, and the "No newline at end of file" marker applies only to the removed last line.
  * NOTE(review): scaler.pkl is read with pickle.load - only safe if that file comes from a trusted source.
Nam/Feature.py | 47 +++++++++++++++++++++++---------------- Nam/model.running_code.py | 41 +++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 32 deletions(-) diff --git a/Nam/Feature.py b/Nam/Feature.py index e7634d6..700ac94 100644 --- a/Nam/Feature.py +++ b/Nam/Feature.py @@ -1,4 +1,11 @@ import re +from collections import Counter +from scipy.stats import entropy + +def calculate_url_entropy(url): + counter = Counter(url) + probabilities = [count / len(url) for count in counter.values()] + return entropy(probabilities, base=2) def extract_url_features(url): suspicious_words = [ @@ -7,23 +14,25 @@ def extract_url_features(url): ] return { - 'digit_count' : len(re.findall(r'\d', url)), - 'dash_count' : url.count('-'), - 'underscore_count' : url.count('_'), - 'percent_count' : url.count('%'), - 'equal_count' : url.count('='), - 'question_count' : url.count('?'), - 'at_count' : url.count('@'), - 'count_of_exclamation' : url.count('!'), - 'count_of_dot' : url.count('.'), - 'count_of_double_slash' : url.count('//'), - 'special_char_count' : len(re.findall(r'[^a-zA-Z0-9]', url)), - 'is_ip_in_url' : bool(re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url)), - 'has_www' : 'www' in url, - 'suspicious_word_count' : sum(word in url.lower() for word in suspicious_words), - 'path_depth' : url.count('/') - 2, - 'has_long_digit_sequence' : bool(re.search(r'\d{4,}', url)), - 'has_multiple_dash' : bool(re.search(r'-{2,}', url)), - 'has_https' : url.startswith('https'), - 'ends_with_common_extension' : url.endswith(('.html', '.php')) + 'digit_count': len(re.findall(r'\d', url)), + 'dash_count': url.count('-'), + 'underscore_count': url.count('_'), + 'percent_count': url.count('%'), + 'equal_count': url.count('='), + 'question_count': url.count('?'), + 'at_count': url.count('@'), + 
'count_of_exclamation': url.count('!'), + 'count_of_dot': url.count('.'), + 'count_of_double_slash': url.count('//'), + 'special_char_count': len(re.findall(r'[^a-zA-Z0-9]', url)), + 'is_ip_in_url': bool(re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url)), + 'has_www': 'www' in url, + 'suspicious_word_count': sum(word in url.lower() for word in suspicious_words), + 'path_depth': url.count('/') - 2, + 'has_long_digit_sequence': bool(re.search(r'\d{4,}', url)), + 'has_multiple_dash': bool(re.search(r'-{2,}', url)), + 'has_https': url.startswith('https'), + 'ends_with_common_extension': url.endswith(('.html', '.php')), + 'url_length': len(url), # ✅ 추가 + 'url_entropy': calculate_url_entropy(url) # ✅ 추가 } diff --git a/Nam/model.running_code.py b/Nam/model.running_code.py index ca3a437..f4ee6a4 100644 --- a/Nam/model.running_code.py +++ b/Nam/model.running_code.py @@ -1,39 +1,54 @@ - import pandas as pd import pickle from tensorflow.keras.models import load_model -from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score from Feature import extract_url_features +from collections import Counter +from scipy.stats import entropy import tensorflow as tf +# 🔹 URL 엔트로피 계산 함수 +def calculate_url_entropy(url): + counter = Counter(url) + probabilities = [count / len(url) for count in counter.values()] + return entropy(probabilities, base=2) -# 4. 스케일러 불러오기 +# 🔹 스케일러 불러오기 with open("scaler.pkl", "rb") as f: scaler = pickle.load(f) -# 5. 
모델 불러오기 +# 🔹 모델 불러오기 model = load_model("best_model.h5") +# 🔹 예측 함수 @tf.function(reduce_retracing=True) def predict_with_model(model, input_data): return model(input_data) - +# 🔹 입력 URL 받기 url = input("URL입력 : ") +# 🔹 Feature.py에서 피처 추출 features = extract_url_features(url) -input_df = pd.DataFrame([list(features.values())], columns= features.keys()) +# 🔹 누락된 피처 보완 +features['url_length'] = len(url) +features['url_entropy'] = calculate_url_entropy(url) +# 🔹 데이터프레임 생성 및 정렬 +input_df = pd.DataFrame([features]) +expected_columns = list(scaler.feature_names_in_) +input_df = input_df[expected_columns] + +# 🔹 스케일링 input_scaled = scaler.transform(input_df) - +# 🔹 예측 prediction = predict_with_model(model, input_scaled) +score = float(prediction.numpy()[0][0]) # 🔥 정확히 float으로 변환 - -# 7. 결과 출력 -best_threshold = 0.5 -if prediction[0][0] > best_threshold: - print('악') +# 🔹 출력 +threshold = 0.5 +if score > threshold: + print(f"악성 (악성일 확률: {score:.4f})") else: - print('정') \ No newline at end of file + print(f"정상 (정상일 확률: {1 - score:.4f})")