Compare commits

..

12 Commits

Author SHA1 Message Date
8709a3cbc6 Upload files to "/" 2025-05-22 06:56:32 +00:00
ef6f0cb447 Delete Nam/scaler.pkl 2025-05-21 11:53:56 +00:00
113bf7a747 Delete Nam/scaler 1.pkl 2025-05-21 11:53:50 +00:00
52f20bfcea Upload files to "Nam" 2025-05-21 11:53:38 +00:00
66f9061c4a Upload files to "Nam" 2025-05-21 11:53:09 +00:00
8ccf9e8642 Upload files to "/" 2025-05-21 11:52:11 +00:00
58056ce8a4 Upload files to "Nam" 2025-05-21 11:03:24 +00:00
66f5c75d1e Delete Nam/Final_code 1.py 2025-05-21 10:41:24 +00:00
786ae98996 Upload files to "Nam" 2025-05-21 10:41:11 +00:00
41dbe60e9a 병남씨 다시해주세요 2025-04-30 06:34:41 +00:00
qudwns245
477fc5e159 Nam model 2025-04-30 15:28:50 +09:00
8de5238395 backend-junPreP Update(Update whitelist model), front-urlpredictor.jsx UI Update 2025-04-30 05:03:13 +00:00
34 changed files with 11802 additions and 24554 deletions

38
Nam/Feature.py Normal file
View File

@@ -0,0 +1,38 @@
import re
from collections import Counter
from scipy.stats import entropy
def calculate_url_entropy(url):
    """Return the Shannon entropy (bits per character) of a URL string.

    Fix: an empty string previously produced an empty probability list,
    for which scipy's entropy yields NaN; return 0.0 explicitly instead.
    """
    if not url:
        return 0.0
    counter = Counter(url)
    probabilities = [count / len(url) for count in counter.values()]
    return entropy(probabilities, base=2)
def extract_url_features(url):
    """Build a flat dict of lexical features for a raw URL string.

    Key names are the feature columns the scaler/model were trained with;
    do not rename or reorder them without refitting.
    """
    suspicious_words = [
        'login', 'verify', 'update', 'confirm',
        'account', 'secure', 'ebayisapi', 'banking'
    ]
    lowered = url.lower()
    digit_matches = re.findall(r'\d', url)
    non_alnum = re.findall(r'[^a-zA-Z0-9]', url)
    return {
        'digit_count': len(digit_matches),
        'dash_count': url.count('-'),
        'underscore_count': url.count('_'),
        'percent_count': url.count('%'),
        'equal_count': url.count('='),
        'question_count': url.count('?'),
        'at_count': url.count('@'),
        'count_of_exclamation': url.count('!'),
        'count_of_dot': url.count('.'),
        'count_of_double_slash': url.count('//'),
        'special_char_count': len(non_alnum),
        # Dotted-quad IPv4 literal anywhere in the URL.
        'is_ip_in_url': re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url) is not None,
        'has_www': 'www' in url,
        'suspicious_word_count': sum(word in lowered for word in suspicious_words),
        # NOTE(review): assumes a 'scheme://' prefix contributes two slashes;
        # scheme-less input yields a negative depth -- confirm callers.
        'path_depth': url.count('/') - 2,
        'has_long_digit_sequence': re.search(r'\d{4,}', url) is not None,
        'has_multiple_dash': re.search(r'-{2,}', url) is not None,
        'has_https': url.startswith('https'),
        'ends_with_common_extension': url.endswith(('.html', '.php')),
        'url_length': len(url),
        'url_entropy': calculate_url_entropy(url),
    }

BIN
Nam/best_model 1.h5 Normal file

Binary file not shown.

54
Nam/model.running_code.py Normal file
View File

@@ -0,0 +1,54 @@
import pandas as pd
import pickle
from tensorflow.keras.models import load_model
from Feature import extract_url_features
from collections import Counter
from scipy.stats import entropy
import tensorflow as tf
# 🔹 URL 엔트로피 계산 함수
def calculate_url_entropy(url):
    """Return the Shannon entropy (bits per character) of a URL string.

    Fix: an empty string previously produced an empty probability list,
    for which scipy's entropy yields NaN; return 0.0 explicitly instead.
    """
    if not url:
        return 0.0
    counter = Counter(url)
    probabilities = [count / len(url) for count in counter.values()]
    return entropy(probabilities, base=2)
# Load the feature scaler fitted at training time.
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)
# Load the trained Keras model.
model = load_model("best_model.h5")
# Inference helper; reduce_retracing avoids re-tracing the tf.function
# graph on repeated calls with same-shaped input.
@tf.function(reduce_retracing=True)
def predict_with_model(model, input_data):
    return model(input_data)
# Read the URL to score from stdin (prompt text is user-facing Korean).
url = input("URL입력 : ")
# Extract lexical features via Feature.py.
features = extract_url_features(url)
# NOTE(review): extract_url_features already sets both of these keys
# (see Feature.py), so the two assignments below are redundant -- confirm.
features['url_length'] = len(url)
features['url_entropy'] = calculate_url_entropy(url)
# Build a one-row DataFrame and order columns exactly as the scaler expects.
input_df = pd.DataFrame([features])
expected_columns = list(scaler.feature_names_in_)
input_df = input_df[expected_columns]
# Scale features with the fitted scaler.
input_scaled = scaler.transform(input_df)
# Run inference.
prediction = predict_with_model(model, input_scaled)
score = float(prediction.numpy()[0][0])  # convert the tensor scalar to a plain float
# Print the verdict (messages are user-facing Korean).
threshold = 0.5
if score > threshold:
    print(f"악성 (악성일 확률: {score:.4f})")
else:
    print(f"정상 (정상일 확률: {1 - score:.4f})")

BIN
Nam/model.scaler.pkl Normal file

Binary file not shown.

Binary file not shown.

View File

@@ -1,52 +1,53 @@
from app.junPreP import extract_features
import numpy as np
import pickle
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model
import tensorflow as tf
import os
# Model and scaler paths, resolved relative to this file so they work
# regardless of the FastAPI process's working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(BASE_DIR, "models", "Recall_0.77.keras")
SCALER_PATH = os.path.join(BASE_DIR, "models", "scaler.pkl")
# Load model and scaler once at import time (shared across requests).
model = load_model(MODEL_PATH)
with open(SCALER_PATH, 'rb') as f:
    scaler = pickle.load(f)
# Wrap inference in tf.function to avoid graph retracing on repeated calls.
@tf.function(reduce_retracing=True)
def predict_with_model(model, input_data):
    return model(input_data)
# Decision threshold (tunable; presumably chosen on a validation set -- confirm).
BEST_THRESHOLD = 0.4034
# Prediction entry point (imported and called by the FastAPI app).
def predict_url_maliciousness(url: str) -> dict:
    """Score *url* with the Keras model and return a JSON-safe result dict."""
    # Feature extraction (project-local helper).
    features = extract_features(url)
    input_df = pd.DataFrame([list(features.values())], columns=features.keys())
    # Scale with the scaler fitted at training time.
    input_scaled = scaler.transform(input_df)
    # Inference; result is a tensor, indexed to the single output value.
    prediction = predict_with_model(model, input_scaled)
    malicious_prob = float(prediction[0][0])
    # Threshold-based decision.
    is_malicious = bool(malicious_prob > BEST_THRESHOLD)
    # Ensure all values are native Python types (not numpy) for JSON encoding.
    return {
        "url": str(url),
        "malicious_probability": float(malicious_prob),
        "is_malicious": bool(is_malicious),
        "threshold": float(BEST_THRESHOLD)
    }
from app.junPreP import extract_features
import numpy as np
import pickle
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model
import tensorflow as tf
import os
# Model and scaler paths, resolved relative to this file so they work
# regardless of the FastAPI process's working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(BASE_DIR, "models", "White_list_model.keras")
SCALER_PATH = os.path.join(BASE_DIR, "models", "scaler.pkl")
# Load model and scaler once at import time (shared across requests).
model = load_model(MODEL_PATH)
with open(SCALER_PATH, 'rb') as f:
    scaler = pickle.load(f)
# Wrap inference in tf.function to avoid graph retracing on repeated calls.
@tf.function(reduce_retracing=True)
def predict_with_model(model, input_data):
    return model(input_data)
# Decision threshold (tunable; presumably chosen on a validation set -- confirm).
BEST_THRESHOLD = 0.4034
# Prediction entry point (imported and called by the FastAPI app).
def predict_url_maliciousness(url: str) -> dict:
    """Score *url* with the whitelist-aware Keras model; return a result dict."""
    # Feature extraction (project-local helper; whitelist-aware version).
    features = extract_features(url)
    input_df = pd.DataFrame([list(features.values())], columns=features.keys())
    # Scale with the scaler fitted at training time.
    input_scaled = scaler.transform(input_df)
    # Inference; .numpy() pulls the scalar out of the eager tensor.
    prediction = predict_with_model(model, input_scaled)
    malicious_prob = float(prediction[0][0].numpy())
    # Threshold-based decision.
    is_malicious = bool(malicious_prob > BEST_THRESHOLD)
    # Values above are already native Python float/bool, safe for JSON encoding.
    return {
        "url": str(url),
        "malicious_probability": malicious_prob,
        "is_malicious": is_malicious,
        "threshold": float(BEST_THRESHOLD)
    }

View File

@@ -1,204 +1,274 @@
import re
from urllib.parse import urlparse, parse_qs
import tldextract
import zlib
import re
from urllib.parse import urlparse
from collections import Counter
import math
def check_similar_brand(url):
    """Return True when the host contains a lookalike of a well-known brand
    (e.g. 'g00gle') while the genuine brand name itself is absent.

    Returns False on any parsing error.
    """
    common_brands = {
        'google', 'facebook', 'amazon', 'microsoft', 'apple',
        'netflix', 'paypal', 'twitter', 'instagram', 'linkedin',
        'youtube', 'yahoo', 'gmail', 'whatsapp', 'tiktok',
        'geocities', 'angelfire', 'newadvent', 'wikipedia',
    }
    try:
        # Prepend '//' so urlparse still yields a netloc for scheme-less input.
        host = urlparse(url if '//' in url else '//' + url).netloc
        host = host.lower() if host else url.lower()
        for brand in common_brands:
            if brand in host:
                # Genuine brand present -> not a lookalike of this brand.
                continue
            # Common typosquatting variants of the brand name.
            variants = (
                brand.replace('o', '0'),
                brand.replace('i', '1'),
                brand.replace('l', '1'),
                brand.replace('e', '3'),
                brand.replace('a', '4'),
                brand.replace('s', '5'),
                brand + '-',
                brand + '_',
                brand[:-1],                       # last character dropped
                ''.join(ch + ch for ch in brand), # every character doubled
            )
            if any(variant in host for variant in variants):
                return True
    except Exception:
        return False
    return False
# url 압축 비율 계산 함수
def compression_ratio(url: str) -> float:
    """Ratio of zlib-compressed size to raw UTF-8 byte size of *url*.

    Highly repetitive strings compress well (low ratio). Returns 0.0 for
    an empty string to avoid a zero division.
    """
    if not url:
        return 0.0
    raw = url.encode('utf-8')
    return len(zlib.compress(raw)) / len(raw)
def extract_features(url):
    """Extract lexical/structural features from a URL for the classifier.

    Returns a dict whose insertion order defines the model's input column
    order -- do not reorder or rename keys without refitting the scaler.
    """
    parsed_url = urlparse(url)
    # Keywords frequently abused in phishing URLs.
    suspicious_keywords = [
        'login', 'verify', 'account', 'update', 'secure', 'banking',
        'paypal', 'confirm', 'signin', 'auth', 'redirect', 'free',
        'bonus', 'admin', 'support', 'server', 'password', 'click',
        'urgent', 'immediate', 'alert', 'security', 'prompt'
    ]
    additional_keywords = [
        'verify', 'wallet', 'cryptocurrency', 'bitcoin', 'ethereum',
        'validation', 'authenticate', 'reset', 'recover', 'access',
        'limited', 'offer', 'prize', 'win', 'winner', 'payment',
        'bank', 'credit', 'debit', 'card', 'expire', 'suspension',
        'unusual', 'activity', 'verify', 'document', 'invoice'
    ]
    # Deduplicate (e.g. 'verify' appears in both lists).
    all_keywords = list(set(suspicious_keywords + additional_keywords))
    contains_keyword = 0
    keyword_count = 0
    for keyword in all_keywords:
        # Whole-word, case-insensitive match anywhere in the URL.
        if re.search(r'\b' + keyword + r'\b', url, re.IGNORECASE):
            contains_keyword = 1
            keyword_count += 1
    url_length = len(url)
    # Split into subdomain / registered domain / public suffix (TLD).
    extracted = tldextract.extract(url)
    tld = extracted.suffix
    domain = extracted.domain
    subdomain = extracted.subdomain
    tld_length = len(tld) if tld else 0
    common_tlds = ['com', 'org', 'net', 'edu', 'gov', 'mil', 'io', 'co', 'info', 'biz']
    is_common_tld = 1 if tld in common_tlds else 0
    country_tlds = ['us', 'uk', 'ca', 'au', 'de', 'fr', 'jp', 'cn', 'ru', 'br', 'in', 'it', 'es']
    is_country_tld = 1 if tld in country_tlds else 0
    suspicious_tlds = ['xyz', 'top', 'club', 'online', 'site', 'icu', 'vip', 'work', 'rest', 'fit']
    is_suspicious_tld = 1 if tld in suspicious_tlds else 0
    # Known URL-shortener registered domains.
    url_shorteners = ['bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'is.gd', 'buff.ly', 'adf.ly', 'tiny.cc']
    full_domain = f"{domain}.{tld}" if tld else domain
    is_shortened = 1 if full_domain in url_shorteners else 0
    domain_length = len(domain) if domain else 0
    has_subdomain = 1 if subdomain else 0
    subdomain_length = len(subdomain) if subdomain else 0
    subdomain_count = len(subdomain.split('.')) if subdomain else 0
    path = parsed_url.path
    path_length = len(path)
    path_depth = path.count('/') if path else 0
    query = parsed_url.query
    has_query = 1 if query else 0
    query_length = len(query) if query else 0
    query_params = parse_qs(query)
    query_param_count = len(query_params) if query_params else 0
    has_fragment = 1 if parsed_url.fragment else 0
    fragment_length = len(parsed_url.fragment) if parsed_url.fragment else 0
    # Character type ratios over the whole URL string.
    letter_count = sum(c.isalpha() for c in url)
    digit_count = sum(c.isdigit() for c in url)
    special_char_count = len(re.findall(r'[^a-zA-Z0-9]', url))
    letter_ratio = letter_count / url_length if url_length > 0 else 0
    digit_ratio = digit_count / url_length if url_length > 0 else 0
    special_char_ratio = special_char_count / url_length if url_length > 0 else 0
    # Shannon entropy of the character distribution (bits per character).
    if url:
        char_counts = Counter(url)
        total_chars = len(url)
        char_frequencies = {char: count/total_chars for char, count in char_counts.items()}
        entropy = -sum(freq * math.log2(freq) for freq in char_frequencies.values())
    else:
        entropy = 0
    # Bucketize the raw length (bin edges presumably chosen from training
    # data quantiles -- confirm against the training notebook).
    if url_length <= 13:
        url_length_cat = 0
    elif url_length <= 18:
        url_length_cat = 1
    elif url_length <= 25:
        url_length_cat = 2
    else:
        url_length_cat = 3
    return {
        # "url_length": url_length,
        "url_length_cat": url_length_cat,
        "num_dots": url.count("."),
        "num_digits": sum(c.isdigit() for c in url),
        "num_special_chars": len(re.findall(r"[^a-zA-Z0-9]", url)),
        "url_keyword": contains_keyword,
        # "url_keyword_count": keyword_count,
        "num_underbar": url.count("_"),
        # 1 if any digit is immediately repeated (e.g. '11').
        "extract_consecutive_numbers": int(bool(re.findall(r'(\d)\1+', url))),
        # 1 if three consecutive pairwise-distinct digits occur.
        "number": int(bool(len(re.findall(r'(\d)(?!\1)(\d)(?!\2)(\d)', url)))),
        "upper": int(any(c.isupper() for c in url)),
        "is_common_tld": is_common_tld,
        # NOTE(review): key contains a space; the training notebook's column
        # list shows "is_country_tld" -- confirm which spelling the fitted
        # scaler/model actually expects before changing anything.
        "is country_tld": is_country_tld,
        "is_suspicious_tld": is_suspicious_tld,
        "domain_length": domain_length,
        "has_subdomain": has_subdomain,
        "subdomain_length": subdomain_length,
        "subdomain_count": subdomain_count,
        # "path_length": path_length,
        "path_depth": path_depth,
        "has_query": has_query,
        "query_length": query_length,
        "query_param_count": query_param_count,
        # "has_fragment": has_fragment,
        # "fragment_length": fragment_length,
        "url_shorteners": is_shortened,
        # Newly added features.
        "compression_ratio": compression_ratio(url),
        "check_similar_brand" : check_similar_brand(url),
        # Advanced text analysis.
        "entropy": entropy,
        #"letter_ratio": letter_ratio,
        "digit_ratio": digit_ratio,
        "special_char_ratio": special_char_ratio
    }
import re
from urllib.parse import urlparse, parse_qs
import tldextract
import zlib
from collections import Counter
import math
def url_is_whitelisted(url):
    """Return True when the URL's host is a trusted domain or a subdomain of one.

    Fix: the original used a plain ``domain.endswith(trusted)`` check, which
    accepts lookalike hosts such as 'evilgoogle.com' or 'notnaver.com' — a
    whitelist bypass on untrusted input. Matching is now boundary-aware:
    the host must equal the trusted domain exactly or end with '.' + domain
    ('mail.google.com' matches 'google.com'; 'evilgoogle.com' does not).
    Returns False on any parsing error.
    """
    trusted_domains = [
        # 1. Portals / search engines
        'naver.com', 'daum.net', 'google.com', 'bing.com', 'yahoo.com',
        # 2. Social media / communication
        'facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'linkedin.com',
        'whatsapp.com', 'kakao.com', 'kakaocorp.com',
        # 3. Video / streaming
        'youtube.com', 'netflix.com', 'twitch.tv', 'tving.com', 'watcha.com',
        # 4. Shopping / e-commerce
        'amazon.com', 'gmarket.co.kr', '11st.co.kr', 'coupang.com', 'ssg.com', 'wemakeprice.com',
        # 5. Finance / payments
        'paypal.com', 'kbfg.com', 'shinhan.com', 'hanafn.com', 'wooribank.com',
        'kakaobank.com', 'toss.im',
        # 6. Government / education
        'gov.kr', 'moe.go.kr', 'epeople.go.kr', 'pusan.ac.kr', 'ac.kr',
        # 7. IT / technology
        'apple.com', 'microsoft.com', 'adobe.com', 'github.com', 'stackoverflow.com'
    ]
    try:
        # Prepend '//' so urlparse still yields a netloc for scheme-less input.
        domain = urlparse(url if '//' in url else '//' + url).netloc.lower()
        return any(
            domain == trusted or domain.endswith('.' + trusted)
            for trusted in trusted_domains
        )
    except Exception:
        return False
def check_similar_brand(url):
    """Return True when the host contains a lookalike of a well-known brand
    (e.g. 'g00gle') while the genuine brand name itself is absent.

    Returns False on any parsing error.
    """
    common_brands = {
        'google', 'facebook', 'amazon', 'microsoft', 'apple',
        'netflix', 'paypal', 'twitter', 'instagram', 'linkedin',
        'youtube', 'yahoo', 'gmail', 'whatsapp', 'tiktok',
        'geocities', 'angelfire', 'newadvent', 'wikipedia',
    }
    try:
        # Prepend '//' so urlparse still yields a netloc for scheme-less input.
        host = urlparse(url if '//' in url else '//' + url).netloc
        host = host.lower() if host else url.lower()
        for brand in common_brands:
            if brand in host:
                # Genuine brand present -> not a lookalike of this brand.
                continue
            # Common typosquatting variants of the brand name.
            variants = (
                brand.replace('o', '0'),
                brand.replace('i', '1'),
                brand.replace('l', '1'),
                brand.replace('e', '3'),
                brand.replace('a', '4'),
                brand.replace('s', '5'),
                brand + '-',
                brand + '_',
                brand[:-1],                       # last character dropped
                ''.join(ch + ch for ch in brand), # every character doubled
            )
            if any(variant in host for variant in variants):
                return True
    except Exception:
        return False
    return False
# url 압축 비율 계산 함수
def compression_ratio(url: str) -> float:
    """Ratio of zlib-compressed size to raw UTF-8 byte size of *url*.

    Highly repetitive strings compress well (low ratio). Returns 0.0 for
    an empty string to avoid a zero division.
    """
    if not url:
        return 0.0
    raw = url.encode('utf-8')
    return len(zlib.compress(raw)) / len(raw)
def extract_features(url):
    """Extract lexical/structural URL features for the whitelist-aware model.

    Returns a dict whose insertion order defines the model's input column
    order -- do not reorder or rename keys without refitting the scaler.
    Whitelisted URLs short-circuit to a fixed "benign-looking" feature row.
    """
    parsed_url = urlparse(url)
    # Keywords frequently abused in phishing URLs.
    suspicious_keywords = [
        'login', 'verify', 'account', 'update', 'secure', 'banking',
        'paypal', 'confirm', 'signin', 'auth', 'redirect', 'free',
        'bonus', 'admin', 'support', 'server', 'password', 'click',
        'urgent', 'immediate', 'alert', 'security', 'prompt'
    ]
    additional_keywords = [
        'verify', 'wallet', 'cryptocurrency', 'bitcoin', 'ethereum',
        'validation', 'authenticate', 'reset', 'recover', 'access',
        'limited', 'offer', 'prize', 'win', 'winner', 'payment',
        'bank', 'credit', 'debit', 'card', 'expire', 'suspension',
        'unusual', 'activity', 'verify', 'document', 'invoice'
    ]
    # Deduplicate (e.g. 'verify' appears in both lists).
    all_keywords = list(set(suspicious_keywords + additional_keywords))
    contains_keyword = 0
    keyword_count = 0
    for keyword in all_keywords:
        # Whole-word, case-insensitive match anywhere in the URL.
        if re.search(r'\b' + keyword + r'\b', url, re.IGNORECASE):
            contains_keyword = 1
            keyword_count += 1
    url_length = len(url)
    # Split into subdomain / registered domain / public suffix (TLD).
    extracted = tldextract.extract(url)
    tld = extracted.suffix
    domain = extracted.domain
    subdomain = extracted.subdomain
    tld_length = len(tld) if tld else 0
    common_tlds = ['com', 'org', 'net', 'edu', 'gov', 'mil', 'io', 'co', 'info', 'biz']
    is_common_tld = 1 if tld in common_tlds else 0
    country_tlds = ['us', 'uk', 'ca', 'au', 'de', 'fr', 'jp', 'cn', 'ru', 'br', 'in', 'it', 'es']
    is_country_tld = 1 if tld in country_tlds else 0
    suspicious_tlds = ['xyz', 'top', 'club', 'online', 'site', 'icu', 'vip', 'work', 'rest', 'fit']
    is_suspicious_tld = 1 if tld in suspicious_tlds else 0
    # Known URL-shortener registered domains.
    url_shorteners = ['bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'is.gd', 'buff.ly', 'adf.ly', 'tiny.cc']
    full_domain = f"{domain}.{tld}" if tld else domain
    is_shortened = 1 if full_domain in url_shorteners else 0
    domain_length = len(domain) if domain else 0
    has_subdomain = 1 if subdomain else 0
    subdomain_length = len(subdomain) if subdomain else 0
    subdomain_count = len(subdomain.split('.')) if subdomain else 0
    path = parsed_url.path
    path_length = len(path)
    path_depth = path.count('/') if path else 0
    query = parsed_url.query
    has_query = 1 if query else 0
    query_length = len(query) if query else 0
    query_params = parse_qs(query)
    query_param_count = len(query_params) if query_params else 0
    has_fragment = 1 if parsed_url.fragment else 0
    fragment_length = len(parsed_url.fragment) if parsed_url.fragment else 0
    # Character type ratios over the whole URL string.
    letter_count = sum(c.isalpha() for c in url)
    digit_count = sum(c.isdigit() for c in url)
    special_char_count = len(re.findall(r'[^a-zA-Z0-9]', url))
    letter_ratio = letter_count / url_length if url_length > 0 else 0
    digit_ratio = digit_count / url_length if url_length > 0 else 0
    special_char_ratio = special_char_count / url_length if url_length > 0 else 0
    # Shannon entropy of the character distribution (bits per character).
    if url:
        char_counts = Counter(url)
        total_chars = len(url)
        char_frequencies = {char: count/total_chars for char, count in char_counts.items()}
        entropy = -sum(freq * math.log2(freq) for freq in char_frequencies.values())
    else:
        entropy = 0
    # Bucketize the raw length (bin edges presumably chosen from training
    # data quantiles -- confirm against the training notebook).
    if url_length <= 13:
        url_length_cat = 0
    elif url_length <= 18:
        url_length_cat = 1
    elif url_length <= 25:
        url_length_cat = 2
    else:
        url_length_cat = 3
    if url_is_whitelisted(url):
        # Whitelisted URL: return a fixed feature row with "typical benign"
        # values so the model scores it as normal.
        # NOTE(review): this bypasses real feature extraction entirely;
        # combined with suffix-based whitelisting, a bug there would let
        # lookalike domains inherit this benign profile -- verify.
        return {
            "url_length_cat": 1,
            "num_dots": 1,
            "num_digits": 0,
            "num_special_chars": 1,
            "url_keyword": 0,
            "num_underbar": 0,
            "extract_consecutive_numbers": 0,
            "number": 0,
            "upper": 0,
            "is_common_tld": 1,
            "is country_tld": 0,
            "is_suspicious_tld": 0,
            "domain_length": 5,
            "has_subdomain": 0,
            "subdomain_length": 0,
            "subdomain_count": 0,
            "path_depth": 0,
            "has_query": 0,
            "query_length": 0,
            "query_param_count": 0,
            "url_shorteners": 0,
            "compression_ratio": 1.0,
            "check_similar_brand": 0,
            "entropy": 3.0,
            "digit_ratio": 0.0,
            "special_char_ratio": 0.1
        }
    return {
        # "url_length": url_length,
        "url_length_cat": url_length_cat,
        "num_dots": url.count("."),
        "num_digits": sum(c.isdigit() for c in url),
        "num_special_chars": len(re.findall(r"[^a-zA-Z0-9]", url)),
        "url_keyword": contains_keyword,
        # "url_keyword_count": keyword_count,
        "num_underbar": url.count("_"),
        # 1 if any digit is immediately repeated (e.g. '11').
        "extract_consecutive_numbers": int(bool(re.findall(r'(\d)\1+', url))),
        # 1 if three consecutive pairwise-distinct digits occur.
        "number": int(bool(len(re.findall(r'(\d)(?!\1)(\d)(?!\2)(\d)', url)))),
        "upper": int(any(c.isupper() for c in url)),
        "is_common_tld": is_common_tld,
        # NOTE(review): key contains a space; the training notebook's column
        # list shows "is_country_tld" -- confirm which spelling the fitted
        # scaler/model actually expects before changing anything.
        "is country_tld": is_country_tld,
        "is_suspicious_tld": is_suspicious_tld,
        "domain_length": domain_length,
        "has_subdomain": has_subdomain,
        "subdomain_length": subdomain_length,
        "subdomain_count": subdomain_count,
        # "path_length": path_length,
        "path_depth": path_depth,
        "has_query": has_query,
        "query_length": query_length,
        "query_param_count": query_param_count,
        # "has_fragment": has_fragment,
        # "fragment_length": fragment_length,
        "url_shorteners": is_shortened,
        # Newly added features.
        "compression_ratio": compression_ratio(url),
        "check_similar_brand" : check_similar_brand(url),
        # Advanced text analysis.
        "entropy": entropy,
        #"letter_ratio": letter_ratio,
        "digit_ratio": digit_ratio,
        "special_char_ratio": special_char_ratio
    }

View File

@@ -2,7 +2,6 @@ from fastapi import FastAPI
from pydantic import BaseModel
from app.model_load import use_model # predictor.py에서 함수 import
from app.exe import predict_url_maliciousness
from app.utils import convert_numpy_to_python_types
from fastapi.middleware.cors import CORSMiddleware
app = FastAPI()
@@ -28,13 +27,15 @@ def root():
def predict(request: UrlRequest):
url = request.url
result_model1 = convert_numpy_to_python_types(use_model(url))
result_model2 = convert_numpy_to_python_types(predict_url_maliciousness(url))
response_data = {
"url": url,
"model1": result_model1,
"model2": result_model2
}
return convert_numpy_to_python_types(response_data)
result_model1 = use_model(url)
result_model2 = predict_url_maliciousness(url)
# print("model1 : ")
# print(result_model1.values())
# print("model2 : ")
# print(result_model2.values())
return {
"url" : url,
"model1": result_model1,
"model2": result_model2
}

View File

@@ -29,6 +29,12 @@ def use_model(url : str):
input_data = featured_df[features_cols]
# 학습된 모델에 적용
model_pred = round(float(np.mean([model.predict_proba(input_data)[:, 1] for model in models_load])), 4)
model_pred = round(np.mean([model.predict_proba(input_data)[:, 1] for model in models_load]), 4)
return model_pred
#return model_pred
return {
"url" : url,
"malicious_probability" : float(model_pred),
"is_malicious" : bool(model_pred > best_threshold),
"threshold" : float(best_threshold)
}

View File

@@ -44,7 +44,7 @@ def predict_url(url: str) -> dict:
input_data = preprocessed[features_cols]
# ✅ 전처리된 데이터 확인
print("Preprocessed input:", input_data)
#print("Preprocessed input:", input_data)
# 평균 확률 계산
probs = [float(model.predict_proba(input_data)[0, 1]) for model in models_load]
@@ -61,8 +61,8 @@ def predict_url(url: str) -> dict:
# 예: malicious_probability가 np.float32 타입일 경우
return {
"url": url,
"malicious_probability": mean_pred, # ⬅️ numpy -> float
"is_malicious": bool(is_malicious), # ⬅️ numpy -> bool
"malicious_probability": mean_pred,
"is_malicious": is_malicious,
"threshold": float(BEST_THRESHOLD) # ⬅️ numpy -> float
}

4
backend/app/testexe.py Normal file
View File

@@ -0,0 +1,4 @@
"""Quick smoke test: run the exe prediction pipeline on a known-good URL."""
from exe import predict_url_maliciousness

result_model2 = predict_url_maliciousness("www.naver.com")
print(result_model2)

View File

@@ -1,18 +0,0 @@
import numpy as np
def convert_numpy_to_python_types(obj):
    """Recursively convert numpy values inside *obj* to native Python types.

    Arrays become (nested) lists, numeric scalars become int/float, numpy
    bools become bool; dicts/lists/tuples are converted element-wise
    (tuples come back as lists). Anything else is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        return convert_numpy_to_python_types(obj.tolist())
    if isinstance(obj, np.number):
        # Floating scalars -> float, everything else numeric -> int.
        return float(obj) if isinstance(obj, np.floating) else int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, dict):
        return {key: convert_numpy_to_python_types(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_numpy_to_python_types(item) for item in obj]
    return obj

BIN
best_model.h5 Normal file

Binary file not shown.

View File

@@ -2392,703 +2392,56 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "object",
"type": "string"
},
{
"name": "label",
"rawType": "float64",
"type": "float"
},
{
"name": "url_length_cat",
"rawType": "float64",
"type": "float"
},
{
"name": "num_dots",
"rawType": "float64",
"type": "float"
},
{
"name": "num_digits",
"rawType": "float64",
"type": "float"
},
{
"name": "num_special_chars",
"rawType": "float64",
"type": "float"
},
{
"name": "url_keyword",
"rawType": "float64",
"type": "float"
},
{
"name": "num_underbar",
"rawType": "float64",
"type": "float"
},
{
"name": "extract_consecutive_numbers",
"rawType": "float64",
"type": "float"
},
{
"name": "number",
"rawType": "float64",
"type": "float"
},
{
"name": "upper",
"rawType": "float64",
"type": "float"
},
{
"name": "is_common_tld",
"rawType": "float64",
"type": "float"
},
{
"name": "is_country_tld",
"rawType": "float64",
"type": "float"
},
{
"name": "is_suspicious_tld",
"rawType": "float64",
"type": "float"
},
{
"name": "domain_length",
"rawType": "float64",
"type": "float"
},
{
"name": "has_subdomain",
"rawType": "float64",
"type": "float"
},
{
"name": "subdomain_length",
"rawType": "float64",
"type": "float"
},
{
"name": "subdomain_count",
"rawType": "float64",
"type": "float"
},
{
"name": "path_depth",
"rawType": "float64",
"type": "float"
},
{
"name": "has_query",
"rawType": "float64",
"type": "float"
},
{
"name": "query_length",
"rawType": "float64",
"type": "float"
},
{
"name": "query_param_count",
"rawType": "float64",
"type": "float"
},
{
"name": "url_shorteners",
"rawType": "float64",
"type": "float"
},
{
"name": "compression_ratio",
"rawType": "float64",
"type": "float"
},
{
"name": "entropy",
"rawType": "float64",
"type": "float"
},
{
"name": "digit_ratio",
"rawType": "float64",
"type": "float"
},
{
"name": "special_char_ratio",
"rawType": "float64",
"type": "float"
}
],
"conversionMethod": "pd.DataFrame",
"ref": "c79a077e-8e52-4e42-b88f-dc9698b0fa30",
"rows": [
[
"count",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0"
],
[
"mean",
"0.22371472079708868",
"1.4435534183000107",
"1.546944584861079",
"1.6343590387267808",
"2.6635716711917676",
"0.0370789025849114",
"0.045005501028154746",
"0.056463736673444787",
"0.08128040719044995",
"0.0357764112252997",
"0.6133649251700057",
"0.12739140329970197",
"0.022784949827420967",
"10.464007150192936",
"0.21130266862767075",
"2.43731000866898",
"0.2660177416735477",
"0.6056849294701858",
"0.027221368921135157",
"1.9155892390282507",
"0.04228915393958247",
"0.0018421582329004942",
"1.4552534994784176",
"3.5360434022769756",
"0.029042428345387533",
"0.1102289088601276"
],
[
"std",
"0.41673309122602675",
"1.1161203432813147",
"1.010078604927829",
"9.827940363271033",
"7.1618457272654",
"0.18895518694176003",
"0.6023702991784359",
"0.23081505741717664",
"0.273265280035072",
"0.18573223887275842",
"0.4869788780260291",
"0.33341093196934307",
"0.14921728811320575",
"5.0652546813544035",
"0.4082326232468674",
"6.90096602515224",
"0.6272395647222854",
"1.6003209664806863",
"0.1627279010519657",
"19.702068343354906",
"0.35208851309719974",
"0.04288082262284407",
"0.24856536988340924",
"0.47898938276414027",
"0.08255957016074264",
"0.046338026902092454"
],
[
"min",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.010181818181818183",
"-0.0",
"0.0",
"0.0"
],
[
"25%",
"0.0",
"0.0",
"1.0",
"0.0",
"1.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"7.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"1.3076923076923077",
"3.238901256602631",
"0.0",
"0.07142857142857142"
],
[
"50%",
"0.0",
"1.0",
"1.0",
"0.0",
"2.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"1.0",
"0.0",
"0.0",
"10.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"1.4444444444444444",
"3.5068905956085183",
"0.0",
"0.10344827586206896"
],
[
"75%",
"0.0",
"2.0",
"2.0",
"0.0",
"3.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"1.0",
"0.0",
"0.0",
"13.0",
"0.0",
"0.0",
"0.0",
"1.0",
"0.0",
"0.0",
"0.0",
"0.0",
"1.6153846153846154",
"3.7962176025900556",
"0.0",
"0.14285714285714285"
],
[
"max",
"1.0",
"3.0",
"171.0",
"2011.0",
"8198.0",
"1.0",
"136.0",
"1.0",
"1.0",
"1.0",
"1.0",
"1.0",
"1.0",
"63.0",
"1.0",
"237.0",
"38.0",
"136.0",
"1.0",
"8367.0",
"131.0",
"1.0",
"5.0",
"6.570554108088201",
"0.9545454545454546",
"1.0"
]
],
"shape": {
"columns": 26,
"rows": 8
}
},
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>url_length_cat</th>\n",
" <th>num_dots</th>\n",
" <th>num_digits</th>\n",
" <th>num_special_chars</th>\n",
" <th>url_keyword</th>\n",
" <th>num_underbar</th>\n",
" <th>extract_consecutive_numbers</th>\n",
" <th>number</th>\n",
" <th>upper</th>\n",
" <th>...</th>\n",
" <th>subdomain_count</th>\n",
" <th>path_depth</th>\n",
" <th>has_query</th>\n",
" <th>query_length</th>\n",
" <th>query_param_count</th>\n",
" <th>url_shorteners</th>\n",
" <th>compression_ratio</th>\n",
" <th>entropy</th>\n",
" <th>digit_ratio</th>\n",
" <th>special_char_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>...</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>2.237147e-01</td>\n",
" <td>1.443553e+00</td>\n",
" <td>1.546945e+00</td>\n",
" <td>1.634359e+00</td>\n",
" <td>2.663572e+00</td>\n",
" <td>3.707890e-02</td>\n",
" <td>4.500550e-02</td>\n",
" <td>5.646374e-02</td>\n",
" <td>8.128041e-02</td>\n",
" <td>3.577641e-02</td>\n",
" <td>...</td>\n",
" <td>2.660177e-01</td>\n",
" <td>6.056849e-01</td>\n",
" <td>2.722137e-02</td>\n",
" <td>1.915589e+00</td>\n",
" <td>4.228915e-02</td>\n",
" <td>1.842158e-03</td>\n",
" <td>1.455253e+00</td>\n",
" <td>3.536043e+00</td>\n",
" <td>2.904243e-02</td>\n",
" <td>1.102289e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>4.167331e-01</td>\n",
" <td>1.116120e+00</td>\n",
" <td>1.010079e+00</td>\n",
" <td>9.827940e+00</td>\n",
" <td>7.161846e+00</td>\n",
" <td>1.889552e-01</td>\n",
" <td>6.023703e-01</td>\n",
" <td>2.308151e-01</td>\n",
" <td>2.732653e-01</td>\n",
" <td>1.857322e-01</td>\n",
" <td>...</td>\n",
" <td>6.272396e-01</td>\n",
" <td>1.600321e+00</td>\n",
" <td>1.627279e-01</td>\n",
" <td>1.970207e+01</td>\n",
" <td>3.520885e-01</td>\n",
" <td>4.288082e-02</td>\n",
" <td>2.485654e-01</td>\n",
" <td>4.789894e-01</td>\n",
" <td>8.255957e-02</td>\n",
" <td>4.633803e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.018182e-02</td>\n",
" <td>-0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.307692e+00</td>\n",
" <td>3.238901e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>7.142857e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>2.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.444444e+00</td>\n",
" <td>3.506891e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.034483e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>0.000000e+00</td>\n",
" <td>2.000000e+00</td>\n",
" <td>2.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>3.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.615385e+00</td>\n",
" <td>3.796218e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.428571e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.000000e+00</td>\n",
" <td>3.000000e+00</td>\n",
" <td>1.710000e+02</td>\n",
" <td>2.011000e+03</td>\n",
" <td>8.198000e+03</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.360000e+02</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>...</td>\n",
" <td>3.800000e+01</td>\n",
" <td>1.360000e+02</td>\n",
" <td>1.000000e+00</td>\n",
" <td>8.367000e+03</td>\n",
" <td>1.310000e+02</td>\n",
" <td>1.000000e+00</td>\n",
" <td>5.000000e+00</td>\n",
" <td>6.570554e+00</td>\n",
" <td>9.545455e-01</td>\n",
" <td>1.000000e+00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" label url_length_cat num_dots num_digits \\\n",
"count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n",
"mean 2.237147e-01 1.443553e+00 1.546945e+00 1.634359e+00 \n",
"std 4.167331e-01 1.116120e+00 1.010079e+00 9.827940e+00 \n",
"min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
"25% 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00 \n",
"50% 0.000000e+00 1.000000e+00 1.000000e+00 0.000000e+00 \n",
"75% 0.000000e+00 2.000000e+00 2.000000e+00 0.000000e+00 \n",
"max 1.000000e+00 3.000000e+00 1.710000e+02 2.011000e+03 \n",
"\n",
" num_special_chars url_keyword num_underbar \\\n",
"count 6.995056e+06 6.995056e+06 6.995056e+06 \n",
"mean 2.663572e+00 3.707890e-02 4.500550e-02 \n",
"std 7.161846e+00 1.889552e-01 6.023703e-01 \n",
"min 0.000000e+00 0.000000e+00 0.000000e+00 \n",
"25% 1.000000e+00 0.000000e+00 0.000000e+00 \n",
"50% 2.000000e+00 0.000000e+00 0.000000e+00 \n",
"75% 3.000000e+00 0.000000e+00 0.000000e+00 \n",
"max 8.198000e+03 1.000000e+00 1.360000e+02 \n",
"\n",
" extract_consecutive_numbers number upper ... \\\n",
"count 6.995056e+06 6.995056e+06 6.995056e+06 ... \n",
"mean 5.646374e-02 8.128041e-02 3.577641e-02 ... \n",
"std 2.308151e-01 2.732653e-01 1.857322e-01 ... \n",
"min 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
"25% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
"50% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
"75% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
"max 1.000000e+00 1.000000e+00 1.000000e+00 ... \n",
"\n",
" subdomain_count path_depth has_query query_length \\\n",
"count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n",
"mean 2.660177e-01 6.056849e-01 2.722137e-02 1.915589e+00 \n",
"std 6.272396e-01 1.600321e+00 1.627279e-01 1.970207e+01 \n",
"min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
"25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
"50% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
"75% 0.000000e+00 1.000000e+00 0.000000e+00 0.000000e+00 \n",
"max 3.800000e+01 1.360000e+02 1.000000e+00 8.367000e+03 \n",
"\n",
" query_param_count url_shorteners compression_ratio entropy \\\n",
"count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n",
"mean 4.228915e-02 1.842158e-03 1.455253e+00 3.536043e+00 \n",
"std 3.520885e-01 4.288082e-02 2.485654e-01 4.789894e-01 \n",
"min 0.000000e+00 0.000000e+00 1.018182e-02 -0.000000e+00 \n",
"25% 0.000000e+00 0.000000e+00 1.307692e+00 3.238901e+00 \n",
"50% 0.000000e+00 0.000000e+00 1.444444e+00 3.506891e+00 \n",
"75% 0.000000e+00 0.000000e+00 1.615385e+00 3.796218e+00 \n",
"max 1.310000e+02 1.000000e+00 5.000000e+00 6.570554e+00 \n",
"\n",
" digit_ratio special_char_ratio \n",
"count 6.995056e+06 6.995056e+06 \n",
"mean 2.904243e-02 1.102289e-01 \n",
"std 8.255957e-02 4.633803e-02 \n",
"min 0.000000e+00 0.000000e+00 \n",
"25% 0.000000e+00 7.142857e-02 \n",
"50% 0.000000e+00 1.034483e-01 \n",
"75% 0.000000e+00 1.428571e-01 \n",
"max 9.545455e-01 1.000000e+00 \n",
"\n",
"[8 rows x 26 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
},
{
"ename": "NameError",
"evalue": "name 'processed_train' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mprocessed_train\u001b[49m.describe()\n",
"\u001b[31mNameError\u001b[39m: name 'processed_train' is not defined"
]
}
],
"source": [
"processed_train.describe()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'processed_train' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mmatplotlib\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mpyplot\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mplt\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mseaborn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01msns\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m desc = \u001b[43mprocessed_train\u001b[49m.describe()\n\u001b[32m 6\u001b[39m plt.figure(figsize=(\u001b[32m12\u001b[39m, \u001b[32m6\u001b[39m))\n\u001b[32m 7\u001b[39m sns.barplot(data=desc.T[[\u001b[33m'\u001b[39m\u001b[33mmean\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mstd\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mmin\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mmax\u001b[39m\u001b[33m'\u001b[39m]])\n",
"\u001b[31mNameError\u001b[39m: name 'processed_train' is not defined"
]
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"desc = processed_train.describe()\n",
"\n",
"plt.figure(figsize=(12, 6))\n",
"sns.barplot(data=desc.T[['mean', 'std', 'min', 'max']])\n",
"plt.title('Feature Statistics')\n",
"plt.xticks(rotation=45)\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
@@ -3248,12 +2601,12 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"from tensorflow.keras.layers import Dense, Dropout, BatchNormalization\n",
"from tensorflow.keras.layers import Dense\n",
"\n",
"def build_model(input_dim, learning_rate=0.001):\n",
" \"\"\"\n",

File diff suppressed because it is too large Load Diff

View File

@@ -12,10 +12,11 @@
"@types/react": "^19.1.0",
"@types/react-dom": "^19.1.1",
"axios": "^1.8.4",
"framer-motion": "^12.9.2",
"react": "^19.1.0",
"react-dom": "^19.1.0",
"react-icons": "^5.5.0",
"react-scripts": "^3.0.1",
"react-scripts": "^5.0.1",
"web-vitals": "^2.1.4"
},
"scripts": {
@@ -43,9 +44,11 @@
]
},
"devDependencies": {
"@babel/preset-react": "^7.26.3",
"autoprefixer": "^10.4.21",
"eslint": "^8.57.1",
"postcss": "^8.5.3",
"tailwindcss": "^3.3.5",
"typescript": "^5.3.3"
"typescript": "^4.1.2"
}
}

View File

@@ -3,7 +3,7 @@
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>URL 악성 판별기</title>
<title></title>
</head>
<body>
<div id="root"></div>

View File

@@ -5,9 +5,8 @@ import './App.css';
function App() {
return (
<div className="min-h-screen bg-gray-100 flex flex-col justify-center">
<div className="min-h-screen bg-sky-200 flex flex-col justify-center">
<div className="container mx-auto px-4 text-center">
<h1 className="text-3xl font-bold text-blue-600 mb-6">🔍 악성 URL 판별기</h1>
<UrlPredictor />
</div>
</div>

View File

@@ -1,5 +1,7 @@
import React, { useState } from "react";
import axios from "axios";
import { motion } from "framer-motion"; // 애니메이션용
import { FaSearch, FaRedo } from "react-icons/fa"; // 아이콘용
const UrlPredictor = () => {
const [url, setUrl] = useState("");
@@ -26,81 +28,90 @@ const UrlPredictor = () => {
}
};
// 모델 정보 정의 (title + 키)
const models = [
{ key: "old_model", title: "🧠 기존 모델 (Ho)" },
{ key: "new_model", title: "🚀 개선 모델 (Jun)" },
{ key: "model1", title: "HO 모델" },
{ key: "model2", title: "Jun 모델" },
];
return (
<div className="min-h-screen bg-gray-100 p-6">
{!results ? (
<div className="flex justify-center items-center h-full">
<form onSubmit={handleSubmit} className="flex gap-4 w-full max-w-2xl">
<div className="min-h-screen bg-blue-50 p-8">
<div className="grid grid-cols-1 md:grid-cols-2 gap-8 h-full">
{/* 왼쪽 입력창 */}
<div className="flex flex-col justify-center items-center gap-6">
<h1 className="text-2x1 font-bold text-blue-700">URL 판별기</h1>
<form onSubmit={handleSubmit} className="flex gap-2 w-full max-w-md">
<input
type="text"
value={url}
onChange={(e) => setUrl(e.target.value)}
placeholder="URL을 입력하세요"
className="flex-grow px-4 py-2 border border-gray-300 rounded shadow"
className="flex-grow px-4 py-2 border border-gray-300 rounded-lg shadow-md focus:outline-none focus:ring-2 focus:ring-blue-400"
required
/>
<button
type="submit"
className="bg-blue-600 text-white px-6 py-2 rounded shadow hover:bg-blue-700 transition"
className="bg-blue-600 text-white px-6 py-2 flex items-center gap-2 rounded-lg shadow-md hover:bg-blue-700 transition"
>
검사하기
<FaSearch /> 검사
</button>
</form>
{loading && (
<div className="flex items-center gap-2">
<div className="w-6 h-6 border-4 border-blue-400 border-t-transparent rounded-full animate-spin"></div>
<p className="text-blue-600 font-semibold">분석 ...</p>
</div>
)}
{error && <p className="text-red-500"> {error}</p>}
</div>
) : (
<div className="grid grid-cols-2 gap-6">
{/* 좌측 입력창 */}
<div className="flex flex-col gap-4">
<form onSubmit={handleSubmit} className="flex gap-2">
<input
type="text"
value={url}
onChange={(e) => setUrl(e.target.value)}
className="flex-grow px-4 py-2 border border-gray-300 rounded shadow"
placeholder="URL을 다시 입력해보세요"
required
/>
<button
type="submit"
className="bg-blue-600 text-white px-4 py-2 rounded hover:bg-blue-700 transition"
>
다시 검사
</button>
</form>
{loading && <p>🔍 분석 ...</p>}
{error && <p className="text-red-500"> {error}</p>}
</div>
{/* 우측 결과 반복 렌더링 */}
<div className="flex flex-col gap-4">
{models.map((model) => {
{/* 오른쪽 결과창 */}
<div className="flex flex-col gap-6">
{results ? (
models.map((model) => {
const data = results[model.key];
if (!data) return null;
return (
<div key={model.key} className="bg-white rounded p-4 shadow">
<h2 className="text-lg font-bold mb-2">{model.title}</h2>
<p>
악성 확률: <strong>{(data.prob * 100).toFixed(2)}%</strong>
<motion.div
key={model.key}
initial={{ opacity: 0, y: 30 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.6 }}
className="bg-white rounded-2xl p-6 shadow-lg border border-gray-200"
>
<h2 className="text-xl font-bold mb-4 text-gray-800">{model.title}</h2>
<p className="mb-2 text-gray-700">
악성 확률:{" "}
<strong>
{(data.malicious_probability * 100).toFixed(2)}%
</strong>
</p>
<p>
판별 결과:{" "}
<strong className={data.malicious ? "text-red-600" : "text-green-600"}>
{data.malicious ? "⚠️ 악성 URL" : "✅ 정상 URL"}
<strong
className={
data.is_malicious
? "text-red-600"
: "text-green-600"
}
>
{data.is_malicious ? "⚠️ 악성 URL" : "✅ 정상 URL"}
</strong>
</p>
</div>
</motion.div>
);
})}
</div>
})
) : (
<div className="text-gray-500 flex items-center justify-center h-full">
결과가 여기에 표시됩니다.
</div>
)}
</div>
)}
</div>
</div>
);
};
export default UrlPredictor;
export default UrlPredictor;

View File

@@ -0,0 +1,26 @@
{
"compilerOptions": {
"target": "es5",
"lib": [
"dom",
"dom.iterable",
"esnext"
],
"allowJs": true,
"skipLibCheck": true,
"esModuleInterop": true,
"allowSyntheticDefaultImports": true,
"strict": true,
"forceConsistentCasingInFileNames": true,
"noFallthroughCasesInSwitch": true,
"module": "esnext",
"moduleResolution": "node",
"resolveJsonModule": true,
"isolatedModules": true,
"noEmit": true,
"jsx": "preserve"
},
"include": [
"src"
]
}

BIN
scaler.pkl Normal file

Binary file not shown.