From 58056ce8a424024877bc8ac41074a60551d0d901 Mon Sep 17 00:00:00 2001
From: hwangtaehyeon <st7276@koseongnam.com>
Date: Wed, 21 May 2025 11:03:24 +0000
Subject: [PATCH] Upload files to "Nam"

---
 Nam/Feature.py            | 47 +++++++++++++++++++++++----------------
 Nam/model.running_code.py | 41 +++++++++++++++++++++++-----------
 2 files changed, 56 insertions(+), 32 deletions(-)

diff --git a/Nam/Feature.py b/Nam/Feature.py
index e7634d6..700ac94 100644
--- a/Nam/Feature.py
+++ b/Nam/Feature.py
@@ -1,4 +1,11 @@
 import re
+from collections import Counter
+from scipy.stats import entropy
+
+def calculate_url_entropy(url):  # Shannon entropy, in bits, of the URL's character distribution
+    counter = Counter(url)  # occurrences of each distinct character
+    probabilities = [count / len(url) for count in counter.values()]  # relative frequency per character
+    return entropy(probabilities, base=2) if url else 0.0  # empty URL -> 0.0 explicitly instead of relying on SciPy's empty-input result
 
 def extract_url_features(url):
     suspicious_words = [
@@ -7,23 +14,25 @@ def extract_url_features(url):
     ]
 
     return {
-        'digit_count' : len(re.findall(r'\d', url)), 
-        'dash_count' : url.count('-'),
-        'underscore_count' : url.count('_'),
-        'percent_count' : url.count('%'),
-        'equal_count' : url.count('='),
-        'question_count' : url.count('?'),
-        'at_count' : url.count('@'),
-        'count_of_exclamation' : url.count('!'),
-        'count_of_dot' : url.count('.'),
-        'count_of_double_slash' : url.count('//'),
-        'special_char_count' : len(re.findall(r'[^a-zA-Z0-9]', url)),
-        'is_ip_in_url' : bool(re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url)),
-        'has_www' : 'www' in url,
-        'suspicious_word_count' : sum(word in url.lower() for word in suspicious_words),
-        'path_depth' : url.count('/') - 2,
-        'has_long_digit_sequence' : bool(re.search(r'\d{4,}', url)),
-        'has_multiple_dash' : bool(re.search(r'-{2,}', url)),
-        'has_https' : url.startswith('https'),
-        'ends_with_common_extension' : url.endswith(('.html', '.php'))
+        'digit_count': len(re.findall(r'\d', url)),
+        'dash_count': url.count('-'),
+        'underscore_count': url.count('_'),
+        'percent_count': url.count('%'),
+        'equal_count': url.count('='),
+        'question_count': url.count('?'),
+        'at_count': url.count('@'),
+        'count_of_exclamation': url.count('!'),
+        'count_of_dot': url.count('.'),
+        'count_of_double_slash': url.count('//'),
+        'special_char_count': len(re.findall(r'[^a-zA-Z0-9]', url)),  # every character outside a-z, A-Z, 0-9
+        'is_ip_in_url': bool(re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url)),  # four dot-separated 1-3 digit groups; octet range is not validated
+        'has_www': 'www' in url,  # substring match anywhere in the URL
+        'suspicious_word_count': sum(word in url.lower() for word in suspicious_words),  # number of list entries found in the lowercased URL
+        'path_depth': url.count('/') - 2,  # slash count minus 2 (presumably discounting the "//" after the scheme - confirm)
+        'has_long_digit_sequence': bool(re.search(r'\d{4,}', url)),  # run of four or more digits
+        'has_multiple_dash': bool(re.search(r'-{2,}', url)),  # two or more consecutive dashes
+        'has_https': url.startswith('https'),  # case-sensitive prefix test on the raw URL
+        'ends_with_common_extension': url.endswith(('.html', '.php')),
+        'url_length': len(url),  # ✅ added: total number of characters
+        'url_entropy': calculate_url_entropy(url)  # ✅ added: character-level Shannon entropy in bits
     }
diff --git a/Nam/model.running_code.py b/Nam/model.running_code.py
index ca3a437..f4ee6a4 100644
--- a/Nam/model.running_code.py
+++ b/Nam/model.running_code.py
@@ -1,39 +1,54 @@
-
 import pandas as pd
 import pickle
 from tensorflow.keras.models import load_model
-from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
 from Feature import extract_url_features
+from collections import Counter
+from scipy.stats import entropy
 import tensorflow as tf
 
+# 🔹 URL entropy helper (same computation as calculate_url_entropy in Feature.py)
+def calculate_url_entropy(url):  # Shannon entropy, in bits, of the URL's character distribution
+    counter = Counter(url)  # occurrences of each distinct character
+    probabilities = [count / len(url) for count in counter.values()]  # relative frequency per character
+    return entropy(probabilities, base=2) if url else 0.0  # empty URL -> 0.0 explicitly instead of relying on SciPy's empty-input result
 
-# 4. 스케일러 불러오기
+# 🔹 Load the scaler
 with open("scaler.pkl", "rb") as f:
     scaler = pickle.load(f)
 
-# 5. 모델 불러오기
+# 🔹 Load the model
 model = load_model("best_model.h5")
 
+# 🔹 Prediction function
 @tf.function(reduce_retracing=True)
 def predict_with_model(model, input_data):
     return model(input_data)
 
-
+# 🔹 Read the input URL
 url = input("URL입력 : ")
 
+# 🔹 Extract features via Feature.py
 features = extract_url_features(url)
-input_df = pd.DataFrame([list(features.values())], columns= features.keys())
 
+# 🔹 Fill in features that may be missing (overwrites these keys if extract_url_features already set them)
+features['url_length'] = len(url)
+features['url_entropy'] = calculate_url_entropy(url)
 
+# 🔹 Build the DataFrame and put its columns in the scaler's order
+input_df = pd.DataFrame([features])  # single-row frame, one column per feature key
+expected_columns = list(scaler.feature_names_in_)  # assumes the scaler was fitted on named columns - TODO confirm
+input_df = input_df[expected_columns]  # select/reorder; raises KeyError if an expected column is absent
+
+# 🔹 Scaling
 input_scaled = scaler.transform(input_df)
 
-
+# 🔹 Prediction
 prediction = predict_with_model(model, input_scaled)
+score = float(prediction.numpy()[0][0])  # 🔥 take the first row's first output and convert it to a plain Python float
 
-
-# 7. 결과 출력
-best_threshold = 0.5
-if prediction[0][0] > best_threshold:
-    print('악')
+# 🔹 Output
+threshold = 0.5
+if score > threshold:
+    print(f"악성 (악성일 확률: {score:.4f})")
 else:
-    print('정')
\ No newline at end of file
+    print(f"정상 (정상일 확률: {1 - score:.4f})")