Compare commits

...

14 Commits

36 changed files with 13597 additions and 24555 deletions

Submodule 1st-project deleted from 16f48f84a3

38
Nam/Feature.py Normal file
View File

@@ -0,0 +1,38 @@
import re
from collections import Counter
from scipy.stats import entropy
def calculate_url_entropy(url):
    """Return the Shannon entropy (in bits) of *url*'s character distribution.

    Args:
        url: URL string to measure.

    Returns:
        float: entropy in bits; 0.0 for an empty string (the original code
        passed an empty distribution to scipy, producing NaN).
    """
    if not url:
        return 0.0
    counter = Counter(url)
    probabilities = [count / len(url) for count in counter.values()]
    return entropy(probabilities, base=2)
def extract_url_features(url):
    """Build a dict of lexical features for a URL string.

    Counts of digits, punctuation, and phishing-associated keywords, plus a
    few boolean flags and the character entropy (via the sibling
    calculate_url_entropy). Key names and values are consumed downstream by
    the trained model's scaler, so they must stay stable.
    """
    watch_terms = [
        'login', 'verify', 'update', 'confirm',
        'account', 'secure', 'ebayisapi', 'banking'
    ]
    lowered = url.lower()

    feats = {}
    feats['digit_count'] = len(re.findall(r'\d', url))
    # Simple substring tallies share one loop instead of repeated literals.
    for key, token in (
        ('dash_count', '-'),
        ('underscore_count', '_'),
        ('percent_count', '%'),
        ('equal_count', '='),
        ('question_count', '?'),
        ('at_count', '@'),
        ('count_of_exclamation', '!'),
        ('count_of_dot', '.'),
        ('count_of_double_slash', '//'),
    ):
        feats[key] = url.count(token)
    feats['special_char_count'] = len(re.findall(r'[^a-zA-Z0-9]', url))
    feats['is_ip_in_url'] = bool(re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url))
    feats['has_www'] = 'www' in url
    feats['suspicious_word_count'] = sum(term in lowered for term in watch_terms)
    # NOTE(review): goes negative for scheme-less URLs ("a.com" -> -2) — confirm intended.
    feats['path_depth'] = url.count('/') - 2
    feats['has_long_digit_sequence'] = bool(re.search(r'\d{4,}', url))
    feats['has_multiple_dash'] = bool(re.search(r'-{2,}', url))
    feats['has_https'] = url.startswith('https')
    feats['ends_with_common_extension'] = url.endswith(('.html', '.php'))
    feats['url_length'] = len(url)
    feats['url_entropy'] = calculate_url_entropy(url)
    return feats

BIN
Nam/best_model 1.h5 Normal file

Binary file not shown.

54
Nam/model.running_code.py Normal file
View File

@@ -0,0 +1,54 @@
"""Interactive CLI: load the trained Keras model and scaler, score one URL."""
import pandas as pd
import pickle
from tensorflow.keras.models import load_model
from Feature import extract_url_features
from collections import Counter
from scipy.stats import entropy
import tensorflow as tf


def calculate_url_entropy(url):
    """Shannon entropy (bits) of the URL's character distribution.

    NOTE(review): duplicates Feature.calculate_url_entropy — consider
    importing it instead of redefining.
    """
    counter = Counter(url)
    probabilities = [count / len(url) for count in counter.values()]
    return entropy(probabilities, base=2)


@tf.function(reduce_retracing=True)
def predict_with_model(model, input_data):
    """Run inference through tf.function so repeated calls avoid retracing."""
    return model(input_data)


def main():
    """Prompt for a URL, build its feature vector, and print the verdict.

    Originally all of this ran at import time (including ``input()`` and the
    model/scaler loads); wrapping it in ``main()`` makes the module safely
    importable.
    """
    # Fitted scaler pickled at training time.
    with open("scaler.pkl", "rb") as f:
        scaler = pickle.load(f)
    # Trained Keras model.
    model = load_model("best_model.h5")

    url = input("URL입력 : ")

    # Extract lexical features; top up the two fields defensively in case an
    # older Feature.py without them is on the path.
    features = extract_url_features(url)
    features['url_length'] = len(url)
    features['url_entropy'] = calculate_url_entropy(url)

    # Order columns exactly as the scaler saw them during fit.
    input_df = pd.DataFrame([features])
    input_df = input_df[list(scaler.feature_names_in_)]
    input_scaled = scaler.transform(input_df)

    prediction = predict_with_model(model, input_scaled)
    score = float(prediction.numpy()[0][0])  # tensor -> native float

    threshold = 0.5
    if score > threshold:
        print(f"악성 (악성일 확률: {score:.4f})")
    else:
        print(f"정상 (정상일 확률: {1 - score:.4f})")


if __name__ == "__main__":
    main()

BIN
Nam/model.scaler.pkl Normal file

Binary file not shown.

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@@ -1,52 +1,53 @@
from app.junPreP import extract_features from app.junPreP import extract_features
import numpy as np import numpy as np
import pickle import pickle
import pandas as pd import pandas as pd
from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model from tensorflow.keras.models import load_model
import tensorflow as tf import tensorflow as tf
import os import os
# 모델 및 스케일러 경로 (FastAPI 기준으로 맞춰서 절대 경로 또는 경로 설정) # 모델 및 스케일러 경로 (FastAPI 기준으로 맞춰서 절대 경로 또는 경로 설정)
BASE_DIR = os.path.dirname(os.path.abspath(__file__)) BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(BASE_DIR, "models", "Recall_0.77.keras") MODEL_PATH = os.path.join(BASE_DIR, "models", "White_list_model.keras")
SCALER_PATH = os.path.join(BASE_DIR, "models", "scaler.pkl") SCALER_PATH = os.path.join(BASE_DIR, "models", "scaler.pkl")
# 모델 및 스케일러 로드 (1회만 수행) # 모델 및 스케일러 로드 (1회만 수행)
model = load_model(MODEL_PATH) model = load_model(MODEL_PATH)
with open(SCALER_PATH, 'rb') as f: with open(SCALER_PATH, 'rb') as f:
scaler = pickle.load(f) scaler = pickle.load(f)
# @tf.function으로 추론 최적화 # @tf.function으로 추론 최적화
@tf.function(reduce_retracing=True) @tf.function(reduce_retracing=True)
def predict_with_model(model, input_data): def predict_with_model(model, input_data):
return model(input_data) return model(input_data)
# Threshold (적절히 조정 가능) # Threshold (적절히 조정 가능)
BEST_THRESHOLD = 0.4034 BEST_THRESHOLD = 0.4034
# 📦 예측 함수 정의 (FastAPI에서 import해서 사용) # 📦 예측 함수 정의 (FastAPI에서 import해서 사용)
def predict_url_maliciousness(url: str) -> dict: def predict_url_maliciousness(url: str) -> dict:
# 특성 추출 # 특성 추출
features = extract_features(url) features = extract_features(url)
input_df = pd.DataFrame([list(features.values())], columns=features.keys()) input_df = pd.DataFrame([list(features.values())], columns=features.keys())
# 스케일링 # 스케일링
input_scaled = scaler.transform(input_df) input_scaled = scaler.transform(input_df)
# 예측 # 예측
prediction = predict_with_model(model, input_scaled) prediction = predict_with_model(model, input_scaled)
malicious_prob = float(prediction[0][0]) malicious_prob = float(prediction[0][0].numpy())
# 임계값 기반 판단
is_malicious = bool(malicious_prob > BEST_THRESHOLD) # 임계값 기반 판단
is_malicious = bool(malicious_prob > BEST_THRESHOLD)
# Ensure all values are Python native types (not numpy types)
return { # 예: malicious_probability가 np.float32 타입일 경우
"url": str(url), return {
"malicious_probability": float(malicious_prob), "url": str(url),
"is_malicious": bool(is_malicious), "malicious_probability": malicious_prob,
"threshold": float(BEST_THRESHOLD) "is_malicious": is_malicious,
} "threshold": float(BEST_THRESHOLD)
}

View File

@@ -1,204 +1,274 @@
import re import re
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
import tldextract import tldextract
import zlib import zlib
import re from collections import Counter
from urllib.parse import urlparse import math
from collections import Counter
import math def url_is_whitelisted(url):
trusted_domains = [
# 1. 포털 / 검색엔진
'naver.com', 'daum.net', 'google.com', 'bing.com', 'yahoo.com',
def check_similar_brand(url):
# 자주 사용되는 브랜드/도메인 목록 # 2. 소셜 미디어 / 커뮤니케이션
common_brands = { 'facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'linkedin.com',
'google', 'facebook', 'amazon', 'microsoft', 'apple', 'whatsapp.com', 'kakao.com', 'kakaocorp.com',
'netflix', 'paypal', 'twitter', 'instagram', 'linkedin',
'youtube', 'yahoo', 'gmail', 'whatsapp', 'tiktok', # 3. 동영상 / 스트리밍
'geocities', 'angelfire', 'newadvent', 'wikipedia', 'youtube.com', 'netflix.com', 'twitch.tv', 'tving.com', 'watcha.com',
}
# 4. 쇼핑 / 이커머스
# 2. 유사 브랜드 확인 'amazon.com', 'gmarket.co.kr', '11st.co.kr', 'coupang.com', 'ssg.com', 'wemakeprice.com',
try:
# URL 파싱 # 5. 금융 / 결제
parsed = urlparse(url if '//' in url else '//' + url) 'paypal.com', 'kbfg.com', 'shinhan.com', 'hanafn.com', 'wooribank.com',
domain = parsed.netloc.lower() if parsed.netloc else url.lower() 'kakaobank.com', 'toss.im',
for brand in common_brands: # 6. 공공기관 / 교육
if brand not in domain: 'gov.kr', 'moe.go.kr', 'epeople.go.kr', 'pusan.ac.kr', 'ac.kr',
similar = False
# 비슷한 철자 패턴 확인 # 7. IT / 기술
patterns = [ 'apple.com', 'microsoft.com', 'adobe.com', 'github.com', 'stackoverflow.com'
brand.replace('o', '0'), ]
brand.replace('i', '1'),
brand.replace('l', '1'), try:
brand.replace('e', '3'), domain = urlparse(url if '//' in url else '//' + url).netloc.lower()
brand.replace('a', '4'), for trusted in trusted_domains:
brand.replace('s', '5'), if domain.endswith(trusted):
brand + '-', return True
brand + '_', return False
brand[:-1], # 마지막 문자 제거 except:
''.join(c + c for c in brand), # 문자 중복 return False
]
for pattern in patterns:
if pattern in domain: def check_similar_brand(url):
similar = True # 자주 사용되는 브랜드/도메인 목록
break common_brands = {
'google', 'facebook', 'amazon', 'microsoft', 'apple',
if similar: 'netflix', 'paypal', 'twitter', 'instagram', 'linkedin',
return True # 유사 브랜드가 발견되면 True 반환 'youtube', 'yahoo', 'gmail', 'whatsapp', 'tiktok',
'geocities', 'angelfire', 'newadvent', 'wikipedia',
except Exception as e: }
return False # 예외 발생 시 False 반환
# 2. 유사 브랜드 확인
return False # 유사 브랜드가 없으면 False 반환 try:
# URL 파싱
parsed = urlparse(url if '//' in url else '//' + url)
domain = parsed.netloc.lower() if parsed.netloc else url.lower()
# url 압축 비율 계산 함수
def compression_ratio(url: str) -> float: for brand in common_brands:
if not url: if brand not in domain:
return 0.0 similar = False
original_length = len(url.encode('utf-8')) # 비슷한 철자 패턴 확인
compressed_data = zlib.compress(url.encode('utf-8')) patterns = [
compressed_length = len(compressed_data) brand.replace('o', '0'),
return compressed_length / original_length brand.replace('i', '1'),
brand.replace('l', '1'),
brand.replace('e', '3'),
def extract_features(url): brand.replace('a', '4'),
parsed_url = urlparse(url) brand.replace('s', '5'),
suspicious_keywords = [ brand + '-',
'login', 'verify', 'account', 'update', 'secure', 'banking', brand + '_',
'paypal', 'confirm', 'signin', 'auth', 'redirect', 'free', brand[:-1], # 마지막 문자 제거
'bonus', 'admin', 'support', 'server', 'password', 'click', ''.join(c + c for c in brand), # 문자 중복
'urgent', 'immediate', 'alert', 'security', 'prompt' ]
]
for pattern in patterns:
additional_keywords = [ if pattern in domain:
'verify', 'wallet', 'cryptocurrency', 'bitcoin', 'ethereum', similar = True
'validation', 'authenticate', 'reset', 'recover', 'access', break
'limited', 'offer', 'prize', 'win', 'winner', 'payment',
'bank', 'credit', 'debit', 'card', 'expire', 'suspension', if similar:
'unusual', 'activity', 'verify', 'document', 'invoice' return True # 유사 브랜드가 발견되면 True 반환
]
except Exception as e:
all_keywords = list(set(suspicious_keywords + additional_keywords)) return False # 예외 발생 시 False 반환
contains_keyword = 0 return False # 유사 브랜드가 없으면 False 반환
keyword_count = 0
for keyword in all_keywords:
if re.search(r'\b' + keyword + r'\b', url, re.IGNORECASE):
contains_keyword = 1 # url 압축 비율 계산 함수
keyword_count += 1 def compression_ratio(url: str) -> float:
if not url:
url_length = len(url) return 0.0
extracted = tldextract.extract(url) original_length = len(url.encode('utf-8'))
tld = extracted.suffix compressed_data = zlib.compress(url.encode('utf-8'))
domain = extracted.domain compressed_length = len(compressed_data)
subdomain = extracted.subdomain return compressed_length / original_length
tld_length = len(tld) if tld else 0
common_tlds = ['com', 'org', 'net', 'edu', 'gov', 'mil', 'io', 'co', 'info', 'biz'] def extract_features(url):
is_common_tld = 1 if tld in common_tlds else 0 parsed_url = urlparse(url)
country_tlds = ['us', 'uk', 'ca', 'au', 'de', 'fr', 'jp', 'cn', 'ru', 'br', 'in', 'it', 'es'] suspicious_keywords = [
is_country_tld = 1 if tld in country_tlds else 0 'login', 'verify', 'account', 'update', 'secure', 'banking',
suspicious_tlds = ['xyz', 'top', 'club', 'online', 'site', 'icu', 'vip', 'work', 'rest', 'fit'] 'paypal', 'confirm', 'signin', 'auth', 'redirect', 'free',
is_suspicious_tld = 1 if tld in suspicious_tlds else 0 'bonus', 'admin', 'support', 'server', 'password', 'click',
url_shorteners = ['bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'is.gd', 'buff.ly', 'adf.ly', 'tiny.cc'] 'urgent', 'immediate', 'alert', 'security', 'prompt'
full_domain = f"{domain}.{tld}" if tld else domain ]
is_shortened = 1 if full_domain in url_shorteners else 0
additional_keywords = [
'verify', 'wallet', 'cryptocurrency', 'bitcoin', 'ethereum',
domain_length = len(domain) if domain else 0 'validation', 'authenticate', 'reset', 'recover', 'access',
has_subdomain = 1 if subdomain else 0 'limited', 'offer', 'prize', 'win', 'winner', 'payment',
subdomain_length = len(subdomain) if subdomain else 0 'bank', 'credit', 'debit', 'card', 'expire', 'suspension',
subdomain_count = len(subdomain.split('.')) if subdomain else 0 'unusual', 'activity', 'verify', 'document', 'invoice'
]
path = parsed_url.path
path_length = len(path) all_keywords = list(set(suspicious_keywords + additional_keywords))
path_depth = path.count('/') if path else 0
contains_keyword = 0
query = parsed_url.query keyword_count = 0
has_query = 1 if query else 0 for keyword in all_keywords:
query_length = len(query) if query else 0 if re.search(r'\b' + keyword + r'\b', url, re.IGNORECASE):
query_params = parse_qs(query) contains_keyword = 1
query_param_count = len(query_params) if query_params else 0 keyword_count += 1
has_fragment = 1 if parsed_url.fragment else 0 url_length = len(url)
fragment_length = len(parsed_url.fragment) if parsed_url.fragment else 0 extracted = tldextract.extract(url)
tld = extracted.suffix
# Character type ratios domain = extracted.domain
letter_count = sum(c.isalpha() for c in url) subdomain = extracted.subdomain
digit_count = sum(c.isdigit() for c in url)
special_char_count = len(re.findall(r'[^a-zA-Z0-9]', url)) tld_length = len(tld) if tld else 0
common_tlds = ['com', 'org', 'net', 'edu', 'gov', 'mil', 'io', 'co', 'info', 'biz']
letter_ratio = letter_count / url_length if url_length > 0 else 0 is_common_tld = 1 if tld in common_tlds else 0
digit_ratio = digit_count / url_length if url_length > 0 else 0 country_tlds = ['us', 'uk', 'ca', 'au', 'de', 'fr', 'jp', 'cn', 'ru', 'br', 'in', 'it', 'es']
special_char_ratio = special_char_count / url_length if url_length > 0 else 0 is_country_tld = 1 if tld in country_tlds else 0
suspicious_tlds = ['xyz', 'top', 'club', 'online', 'site', 'icu', 'vip', 'work', 'rest', 'fit']
# Character distribution and entropy is_suspicious_tld = 1 if tld in suspicious_tlds else 0
if url: url_shorteners = ['bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'is.gd', 'buff.ly', 'adf.ly', 'tiny.cc']
char_counts = Counter(url) full_domain = f"{domain}.{tld}" if tld else domain
total_chars = len(url) is_shortened = 1 if full_domain in url_shorteners else 0
char_frequencies = {char: count/total_chars for char, count in char_counts.items()}
entropy = -sum(freq * math.log2(freq) for freq in char_frequencies.values())
else: domain_length = len(domain) if domain else 0
entropy = 0 has_subdomain = 1 if subdomain else 0
subdomain_length = len(subdomain) if subdomain else 0
subdomain_count = len(subdomain.split('.')) if subdomain else 0
path = parsed_url.path
path_length = len(path)
if url_length <= 13: path_depth = path.count('/') if path else 0
url_length_cat = 0
elif url_length <= 18: query = parsed_url.query
url_length_cat = 1 has_query = 1 if query else 0
elif url_length <= 25: query_length = len(query) if query else 0
url_length_cat = 2 query_params = parse_qs(query)
else: query_param_count = len(query_params) if query_params else 0
url_length_cat = 3
has_fragment = 1 if parsed_url.fragment else 0
return { fragment_length = len(parsed_url.fragment) if parsed_url.fragment else 0
# "url_length": url_length,
"url_length_cat": url_length_cat, # Character type ratios
"num_dots": url.count("."), letter_count = sum(c.isalpha() for c in url)
"num_digits": sum(c.isdigit() for c in url), digit_count = sum(c.isdigit() for c in url)
"num_special_chars": len(re.findall(r"[^a-zA-Z0-9]", url)), special_char_count = len(re.findall(r'[^a-zA-Z0-9]', url))
"url_keyword": contains_keyword,
# "url_keyword_count": keyword_count, letter_ratio = letter_count / url_length if url_length > 0 else 0
"num_underbar": url.count("_"), digit_ratio = digit_count / url_length if url_length > 0 else 0
"extract_consecutive_numbers": int(bool(re.findall(r'(\d)\1+', url))), special_char_ratio = special_char_count / url_length if url_length > 0 else 0
"number": int(bool(len(re.findall(r'(\d)(?!\1)(\d)(?!\2)(\d)', url)))),
"upper": int(any(c.isupper() for c in url)), # Character distribution and entropy
if url:
"is_common_tld": is_common_tld, char_counts = Counter(url)
"is country_tld": is_country_tld, total_chars = len(url)
"is_suspicious_tld": is_suspicious_tld, char_frequencies = {char: count/total_chars for char, count in char_counts.items()}
entropy = -sum(freq * math.log2(freq) for freq in char_frequencies.values())
"domain_length": domain_length, else:
"has_subdomain": has_subdomain, entropy = 0
"subdomain_length": subdomain_length,
"subdomain_count": subdomain_count,
# "path_length": path_length,
"path_depth": path_depth,
"has_query": has_query, if url_length <= 13:
"query_length": query_length, url_length_cat = 0
"query_param_count": query_param_count, elif url_length <= 18:
# "has_fragment": has_fragment, url_length_cat = 1
# "fragment_length": fragment_length, elif url_length <= 25:
"url_shorteners": is_shortened, url_length_cat = 2
else:
# 새로 추가된 특성 url_length_cat = 3
"compression_ratio": compression_ratio(url),
"check_similar_brand" : check_similar_brand(url), if url_is_whitelisted(url):
return {
# Advanced text analysis # 화이트리스트 URL이면 특징값들을 "정상적"으로 처리되도록 설정
"entropy": entropy, "url_length_cat": 1,
#"letter_ratio": letter_ratio, "num_dots": 1,
"digit_ratio": digit_ratio, "num_digits": 0,
"special_char_ratio": special_char_ratio "num_special_chars": 1,
"url_keyword": 0,
"num_underbar": 0,
} "extract_consecutive_numbers": 0,
"number": 0,
"upper": 0,
"is_common_tld": 1,
"is country_tld": 0,
"is_suspicious_tld": 0,
"domain_length": 5,
"has_subdomain": 0,
"subdomain_length": 0,
"subdomain_count": 0,
"path_depth": 0,
"has_query": 0,
"query_length": 0,
"query_param_count": 0,
"url_shorteners": 0,
"compression_ratio": 1.0,
"check_similar_brand": 0,
"entropy": 3.0,
"digit_ratio": 0.0,
"special_char_ratio": 0.1
}
return {
# "url_length": url_length,
"url_length_cat": url_length_cat,
"num_dots": url.count("."),
"num_digits": sum(c.isdigit() for c in url),
"num_special_chars": len(re.findall(r"[^a-zA-Z0-9]", url)),
"url_keyword": contains_keyword,
# "url_keyword_count": keyword_count,
"num_underbar": url.count("_"),
"extract_consecutive_numbers": int(bool(re.findall(r'(\d)\1+', url))),
"number": int(bool(len(re.findall(r'(\d)(?!\1)(\d)(?!\2)(\d)', url)))),
"upper": int(any(c.isupper() for c in url)),
"is_common_tld": is_common_tld,
"is country_tld": is_country_tld,
"is_suspicious_tld": is_suspicious_tld,
"domain_length": domain_length,
"has_subdomain": has_subdomain,
"subdomain_length": subdomain_length,
"subdomain_count": subdomain_count,
# "path_length": path_length,
"path_depth": path_depth,
"has_query": has_query,
"query_length": query_length,
"query_param_count": query_param_count,
# "has_fragment": has_fragment,
# "fragment_length": fragment_length,
"url_shorteners": is_shortened,
# 새로 추가된 특성
"compression_ratio": compression_ratio(url),
"check_similar_brand" : check_similar_brand(url),
# Advanced text analysis
"entropy": entropy,
#"letter_ratio": letter_ratio,
"digit_ratio": digit_ratio,
"special_char_ratio": special_char_ratio
}

View File

@@ -2,7 +2,6 @@ from fastapi import FastAPI
from pydantic import BaseModel from pydantic import BaseModel
from app.model_load import use_model # predictor.py에서 함수 import from app.model_load import use_model # predictor.py에서 함수 import
from app.exe import predict_url_maliciousness from app.exe import predict_url_maliciousness
from app.utils import convert_numpy_to_python_types
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
app = FastAPI() app = FastAPI()
@@ -28,13 +27,15 @@ def root():
def predict(request: UrlRequest): def predict(request: UrlRequest):
url = request.url url = request.url
result_model1 = convert_numpy_to_python_types(use_model(url)) result_model1 = use_model(url)
result_model2 = convert_numpy_to_python_types(predict_url_maliciousness(url)) result_model2 = predict_url_maliciousness(url)
# print("model1 : ")
response_data = { # print(result_model1.values())
"url": url, # print("model2 : ")
"model1": result_model1, # print(result_model2.values())
"model2": result_model2
} return {
"url" : url,
return convert_numpy_to_python_types(response_data) "model1": result_model1,
"model2": result_model2
}

View File

@@ -29,6 +29,12 @@ def use_model(url : str):
input_data = featured_df[features_cols] input_data = featured_df[features_cols]
# 학습된 모델에 적용 # 학습된 모델에 적용
model_pred = round(float(np.mean([model.predict_proba(input_data)[:, 1] for model in models_load])), 4) model_pred = round(np.mean([model.predict_proba(input_data)[:, 1] for model in models_load]), 4)
return model_pred #return model_pred
return {
"url" : url,
"malicious_probability" : float(model_pred),
"is_malicious" : bool(model_pred > best_threshold),
"threshold" : float(best_threshold)
}

View File

@@ -44,7 +44,7 @@ def predict_url(url: str) -> dict:
input_data = preprocessed[features_cols] input_data = preprocessed[features_cols]
# ✅ 전처리된 데이터 확인 # ✅ 전처리된 데이터 확인
print("Preprocessed input:", input_data) #print("Preprocessed input:", input_data)
# 평균 확률 계산 # 평균 확률 계산
probs = [float(model.predict_proba(input_data)[0, 1]) for model in models_load] probs = [float(model.predict_proba(input_data)[0, 1]) for model in models_load]
@@ -61,8 +61,8 @@ def predict_url(url: str) -> dict:
# 예: malicious_probability가 np.float32 타입일 경우 # 예: malicious_probability가 np.float32 타입일 경우
return { return {
"url": url, "url": url,
"malicious_probability": mean_pred, # ⬅️ numpy -> float "malicious_probability": mean_pred,
"is_malicious": bool(is_malicious), # ⬅️ numpy -> bool "is_malicious": is_malicious,
"threshold": float(BEST_THRESHOLD) # ⬅️ numpy -> float "threshold": float(BEST_THRESHOLD) # ⬅️ numpy -> float
} }

4
backend/app/testexe.py Normal file
View File

@@ -0,0 +1,4 @@
"""Manual smoke test for exe.predict_url_maliciousness."""
from exe import predict_url_maliciousness

if __name__ == "__main__":
    # Guarded so importing this module does not trigger a model inference.
    result_model2 = predict_url_maliciousness("www.naver.com")
    print(result_model2)

View File

@@ -1,18 +0,0 @@
import numpy as np
def convert_numpy_to_python_types(obj):
    """Recursively convert numpy scalars/arrays to native Python types.

    Arrays become (nested) lists, floating scalars become ``float``, other
    numeric scalars become ``int``, numpy booleans become ``bool``; dicts,
    lists, and tuples are walked recursively (tuples come back as lists).
    Anything else is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        # Materialize the array, then convert its elements.
        return convert_numpy_to_python_types(obj.tolist())
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.number):
        # Any remaining numpy numeric scalar (integers) maps to int.
        return int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, dict):
        return {key: convert_numpy_to_python_types(value)
                for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_numpy_to_python_types(item) for item in obj]
    return obj

BIN
best_model.h5 Normal file

Binary file not shown.

View File

@@ -2392,703 +2392,56 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "ename": "NameError",
"application/vnd.microsoft.datawrangler.viewer.v0+json": { "evalue": "name 'processed_train' is not defined",
"columns": [ "output_type": "error",
{ "traceback": [
"name": "index", "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"rawType": "object", "\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"type": "string" "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mprocessed_train\u001b[49m.describe()\n",
}, "\u001b[31mNameError\u001b[39m: name 'processed_train' is not defined"
{ ]
"name": "label",
"rawType": "float64",
"type": "float"
},
{
"name": "url_length_cat",
"rawType": "float64",
"type": "float"
},
{
"name": "num_dots",
"rawType": "float64",
"type": "float"
},
{
"name": "num_digits",
"rawType": "float64",
"type": "float"
},
{
"name": "num_special_chars",
"rawType": "float64",
"type": "float"
},
{
"name": "url_keyword",
"rawType": "float64",
"type": "float"
},
{
"name": "num_underbar",
"rawType": "float64",
"type": "float"
},
{
"name": "extract_consecutive_numbers",
"rawType": "float64",
"type": "float"
},
{
"name": "number",
"rawType": "float64",
"type": "float"
},
{
"name": "upper",
"rawType": "float64",
"type": "float"
},
{
"name": "is_common_tld",
"rawType": "float64",
"type": "float"
},
{
"name": "is_country_tld",
"rawType": "float64",
"type": "float"
},
{
"name": "is_suspicious_tld",
"rawType": "float64",
"type": "float"
},
{
"name": "domain_length",
"rawType": "float64",
"type": "float"
},
{
"name": "has_subdomain",
"rawType": "float64",
"type": "float"
},
{
"name": "subdomain_length",
"rawType": "float64",
"type": "float"
},
{
"name": "subdomain_count",
"rawType": "float64",
"type": "float"
},
{
"name": "path_depth",
"rawType": "float64",
"type": "float"
},
{
"name": "has_query",
"rawType": "float64",
"type": "float"
},
{
"name": "query_length",
"rawType": "float64",
"type": "float"
},
{
"name": "query_param_count",
"rawType": "float64",
"type": "float"
},
{
"name": "url_shorteners",
"rawType": "float64",
"type": "float"
},
{
"name": "compression_ratio",
"rawType": "float64",
"type": "float"
},
{
"name": "entropy",
"rawType": "float64",
"type": "float"
},
{
"name": "digit_ratio",
"rawType": "float64",
"type": "float"
},
{
"name": "special_char_ratio",
"rawType": "float64",
"type": "float"
}
],
"conversionMethod": "pd.DataFrame",
"ref": "c79a077e-8e52-4e42-b88f-dc9698b0fa30",
"rows": [
[
"count",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0",
"6995056.0"
],
[
"mean",
"0.22371472079708868",
"1.4435534183000107",
"1.546944584861079",
"1.6343590387267808",
"2.6635716711917676",
"0.0370789025849114",
"0.045005501028154746",
"0.056463736673444787",
"0.08128040719044995",
"0.0357764112252997",
"0.6133649251700057",
"0.12739140329970197",
"0.022784949827420967",
"10.464007150192936",
"0.21130266862767075",
"2.43731000866898",
"0.2660177416735477",
"0.6056849294701858",
"0.027221368921135157",
"1.9155892390282507",
"0.04228915393958247",
"0.0018421582329004942",
"1.4552534994784176",
"3.5360434022769756",
"0.029042428345387533",
"0.1102289088601276"
],
[
"std",
"0.41673309122602675",
"1.1161203432813147",
"1.010078604927829",
"9.827940363271033",
"7.1618457272654",
"0.18895518694176003",
"0.6023702991784359",
"0.23081505741717664",
"0.273265280035072",
"0.18573223887275842",
"0.4869788780260291",
"0.33341093196934307",
"0.14921728811320575",
"5.0652546813544035",
"0.4082326232468674",
"6.90096602515224",
"0.6272395647222854",
"1.6003209664806863",
"0.1627279010519657",
"19.702068343354906",
"0.35208851309719974",
"0.04288082262284407",
"0.24856536988340924",
"0.47898938276414027",
"0.08255957016074264",
"0.046338026902092454"
],
[
"min",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.010181818181818183",
"-0.0",
"0.0",
"0.0"
],
[
"25%",
"0.0",
"0.0",
"1.0",
"0.0",
"1.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"7.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"1.3076923076923077",
"3.238901256602631",
"0.0",
"0.07142857142857142"
],
[
"50%",
"0.0",
"1.0",
"1.0",
"0.0",
"2.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"1.0",
"0.0",
"0.0",
"10.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"1.4444444444444444",
"3.5068905956085183",
"0.0",
"0.10344827586206896"
],
[
"75%",
"0.0",
"2.0",
"2.0",
"0.0",
"3.0",
"0.0",
"0.0",
"0.0",
"0.0",
"0.0",
"1.0",
"0.0",
"0.0",
"13.0",
"0.0",
"0.0",
"0.0",
"1.0",
"0.0",
"0.0",
"0.0",
"0.0",
"1.6153846153846154",
"3.7962176025900556",
"0.0",
"0.14285714285714285"
],
[
"max",
"1.0",
"3.0",
"171.0",
"2011.0",
"8198.0",
"1.0",
"136.0",
"1.0",
"1.0",
"1.0",
"1.0",
"1.0",
"1.0",
"63.0",
"1.0",
"237.0",
"38.0",
"136.0",
"1.0",
"8367.0",
"131.0",
"1.0",
"5.0",
"6.570554108088201",
"0.9545454545454546",
"1.0"
]
],
"shape": {
"columns": 26,
"rows": 8
}
},
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>url_length_cat</th>\n",
" <th>num_dots</th>\n",
" <th>num_digits</th>\n",
" <th>num_special_chars</th>\n",
" <th>url_keyword</th>\n",
" <th>num_underbar</th>\n",
" <th>extract_consecutive_numbers</th>\n",
" <th>number</th>\n",
" <th>upper</th>\n",
" <th>...</th>\n",
" <th>subdomain_count</th>\n",
" <th>path_depth</th>\n",
" <th>has_query</th>\n",
" <th>query_length</th>\n",
" <th>query_param_count</th>\n",
" <th>url_shorteners</th>\n",
" <th>compression_ratio</th>\n",
" <th>entropy</th>\n",
" <th>digit_ratio</th>\n",
" <th>special_char_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>...</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" <td>6.995056e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>2.237147e-01</td>\n",
" <td>1.443553e+00</td>\n",
" <td>1.546945e+00</td>\n",
" <td>1.634359e+00</td>\n",
" <td>2.663572e+00</td>\n",
" <td>3.707890e-02</td>\n",
" <td>4.500550e-02</td>\n",
" <td>5.646374e-02</td>\n",
" <td>8.128041e-02</td>\n",
" <td>3.577641e-02</td>\n",
" <td>...</td>\n",
" <td>2.660177e-01</td>\n",
" <td>6.056849e-01</td>\n",
" <td>2.722137e-02</td>\n",
" <td>1.915589e+00</td>\n",
" <td>4.228915e-02</td>\n",
" <td>1.842158e-03</td>\n",
" <td>1.455253e+00</td>\n",
" <td>3.536043e+00</td>\n",
" <td>2.904243e-02</td>\n",
" <td>1.102289e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>4.167331e-01</td>\n",
" <td>1.116120e+00</td>\n",
" <td>1.010079e+00</td>\n",
" <td>9.827940e+00</td>\n",
" <td>7.161846e+00</td>\n",
" <td>1.889552e-01</td>\n",
" <td>6.023703e-01</td>\n",
" <td>2.308151e-01</td>\n",
" <td>2.732653e-01</td>\n",
" <td>1.857322e-01</td>\n",
" <td>...</td>\n",
" <td>6.272396e-01</td>\n",
" <td>1.600321e+00</td>\n",
" <td>1.627279e-01</td>\n",
" <td>1.970207e+01</td>\n",
" <td>3.520885e-01</td>\n",
" <td>4.288082e-02</td>\n",
" <td>2.485654e-01</td>\n",
" <td>4.789894e-01</td>\n",
" <td>8.255957e-02</td>\n",
" <td>4.633803e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.018182e-02</td>\n",
" <td>-0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.307692e+00</td>\n",
" <td>3.238901e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>7.142857e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>2.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.444444e+00</td>\n",
" <td>3.506891e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.034483e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>0.000000e+00</td>\n",
" <td>2.000000e+00</td>\n",
" <td>2.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>3.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.615385e+00</td>\n",
" <td>3.796218e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.428571e-01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.000000e+00</td>\n",
" <td>3.000000e+00</td>\n",
" <td>1.710000e+02</td>\n",
" <td>2.011000e+03</td>\n",
" <td>8.198000e+03</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.360000e+02</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.000000e+00</td>\n",
" <td>...</td>\n",
" <td>3.800000e+01</td>\n",
" <td>1.360000e+02</td>\n",
" <td>1.000000e+00</td>\n",
" <td>8.367000e+03</td>\n",
" <td>1.310000e+02</td>\n",
" <td>1.000000e+00</td>\n",
" <td>5.000000e+00</td>\n",
" <td>6.570554e+00</td>\n",
" <td>9.545455e-01</td>\n",
" <td>1.000000e+00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" label url_length_cat num_dots num_digits \\\n",
"count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n",
"mean 2.237147e-01 1.443553e+00 1.546945e+00 1.634359e+00 \n",
"std 4.167331e-01 1.116120e+00 1.010079e+00 9.827940e+00 \n",
"min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
"25% 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00 \n",
"50% 0.000000e+00 1.000000e+00 1.000000e+00 0.000000e+00 \n",
"75% 0.000000e+00 2.000000e+00 2.000000e+00 0.000000e+00 \n",
"max 1.000000e+00 3.000000e+00 1.710000e+02 2.011000e+03 \n",
"\n",
" num_special_chars url_keyword num_underbar \\\n",
"count 6.995056e+06 6.995056e+06 6.995056e+06 \n",
"mean 2.663572e+00 3.707890e-02 4.500550e-02 \n",
"std 7.161846e+00 1.889552e-01 6.023703e-01 \n",
"min 0.000000e+00 0.000000e+00 0.000000e+00 \n",
"25% 1.000000e+00 0.000000e+00 0.000000e+00 \n",
"50% 2.000000e+00 0.000000e+00 0.000000e+00 \n",
"75% 3.000000e+00 0.000000e+00 0.000000e+00 \n",
"max 8.198000e+03 1.000000e+00 1.360000e+02 \n",
"\n",
" extract_consecutive_numbers number upper ... \\\n",
"count 6.995056e+06 6.995056e+06 6.995056e+06 ... \n",
"mean 5.646374e-02 8.128041e-02 3.577641e-02 ... \n",
"std 2.308151e-01 2.732653e-01 1.857322e-01 ... \n",
"min 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
"25% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
"50% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
"75% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
"max 1.000000e+00 1.000000e+00 1.000000e+00 ... \n",
"\n",
" subdomain_count path_depth has_query query_length \\\n",
"count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n",
"mean 2.660177e-01 6.056849e-01 2.722137e-02 1.915589e+00 \n",
"std 6.272396e-01 1.600321e+00 1.627279e-01 1.970207e+01 \n",
"min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
"25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
"50% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
"75% 0.000000e+00 1.000000e+00 0.000000e+00 0.000000e+00 \n",
"max 3.800000e+01 1.360000e+02 1.000000e+00 8.367000e+03 \n",
"\n",
" query_param_count url_shorteners compression_ratio entropy \\\n",
"count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n",
"mean 4.228915e-02 1.842158e-03 1.455253e+00 3.536043e+00 \n",
"std 3.520885e-01 4.288082e-02 2.485654e-01 4.789894e-01 \n",
"min 0.000000e+00 0.000000e+00 1.018182e-02 -0.000000e+00 \n",
"25% 0.000000e+00 0.000000e+00 1.307692e+00 3.238901e+00 \n",
"50% 0.000000e+00 0.000000e+00 1.444444e+00 3.506891e+00 \n",
"75% 0.000000e+00 0.000000e+00 1.615385e+00 3.796218e+00 \n",
"max 1.310000e+02 1.000000e+00 5.000000e+00 6.570554e+00 \n",
"\n",
" digit_ratio special_char_ratio \n",
"count 6.995056e+06 6.995056e+06 \n",
"mean 2.904243e-02 1.102289e-01 \n",
"std 8.255957e-02 4.633803e-02 \n",
"min 0.000000e+00 0.000000e+00 \n",
"25% 0.000000e+00 7.142857e-02 \n",
"50% 0.000000e+00 1.034483e-01 \n",
"75% 0.000000e+00 1.428571e-01 \n",
"max 9.545455e-01 1.000000e+00 \n",
"\n",
"[8 rows x 26 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
} }
], ],
"source": [ "source": [
"processed_train.describe()" "processed_train.describe()"
] ]
}, },
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'processed_train' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mmatplotlib\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mpyplot\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mplt\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mseaborn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01msns\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m desc = \u001b[43mprocessed_train\u001b[49m.describe()\n\u001b[32m 6\u001b[39m plt.figure(figsize=(\u001b[32m12\u001b[39m, \u001b[32m6\u001b[39m))\n\u001b[32m 7\u001b[39m sns.barplot(data=desc.T[[\u001b[33m'\u001b[39m\u001b[33mmean\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mstd\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mmin\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mmax\u001b[39m\u001b[33m'\u001b[39m]])\n",
"\u001b[31mNameError\u001b[39m: name 'processed_train' is not defined"
]
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"desc = processed_train.describe()\n",
"\n",
"plt.figure(figsize=(12, 6))\n",
"sns.barplot(data=desc.T[['mean', 'std', 'min', 'max']])\n",
"plt.title('Feature Statistics')\n",
"plt.xticks(rotation=45)\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 11,
@@ -3248,12 +2601,12 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import tensorflow as tf\n", "import tensorflow as tf\n",
"from tensorflow.keras.layers import Dense, Dropout, BatchNormalization\n", "from tensorflow.keras.layers import Dense\n",
"\n", "\n",
"def build_model(input_dim, learning_rate=0.001):\n", "def build_model(input_dim, learning_rate=0.001):\n",
" \"\"\"\n", " \"\"\"\n",

File diff suppressed because it is too large Load Diff

View File

@@ -12,10 +12,11 @@
"@types/react": "^19.1.0", "@types/react": "^19.1.0",
"@types/react-dom": "^19.1.1", "@types/react-dom": "^19.1.1",
"axios": "^1.8.4", "axios": "^1.8.4",
"framer-motion": "^12.9.2",
"react": "^19.1.0", "react": "^19.1.0",
"react-dom": "^19.1.0", "react-dom": "^19.1.0",
"react-icons": "^5.5.0", "react-icons": "^5.5.0",
"react-scripts": "^3.0.1", "react-scripts": "^5.0.1",
"web-vitals": "^2.1.4" "web-vitals": "^2.1.4"
}, },
"scripts": { "scripts": {
@@ -43,9 +44,11 @@
] ]
}, },
"devDependencies": { "devDependencies": {
"@babel/preset-react": "^7.26.3",
"autoprefixer": "^10.4.21", "autoprefixer": "^10.4.21",
"eslint": "^8.57.1",
"postcss": "^8.5.3", "postcss": "^8.5.3",
"tailwindcss": "^3.3.5", "tailwindcss": "^3.3.5",
"typescript": "^5.3.3" "typescript": "^4.1.2"
} }
} }

View File

@@ -3,7 +3,7 @@
<head> <head>
<meta charset="UTF-8" /> <meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>URL 악성 판별기</title> <title></title>
</head> </head>
<body> <body>
<div id="root"></div> <div id="root"></div>

View File

@@ -5,9 +5,8 @@ import './App.css';
function App() { function App() {
return ( return (
<div className="min-h-screen bg-gray-100 flex flex-col justify-center"> <div className="min-h-screen bg-sky-200 flex flex-col justify-center">
<div className="container mx-auto px-4 text-center"> <div className="container mx-auto px-4 text-center">
<h1 className="text-3xl font-bold text-blue-600 mb-6">🔍 악성 URL 판별기</h1>
<UrlPredictor /> <UrlPredictor />
</div> </div>
</div> </div>

View File

@@ -1,5 +1,7 @@
import React, { useState } from "react"; import React, { useState } from "react";
import axios from "axios"; import axios from "axios";
import { motion } from "framer-motion"; // 애니메이션용
import { FaSearch, FaRedo } from "react-icons/fa"; // 아이콘용
const UrlPredictor = () => { const UrlPredictor = () => {
const [url, setUrl] = useState(""); const [url, setUrl] = useState("");
@@ -26,81 +28,90 @@ const UrlPredictor = () => {
} }
}; };
// 모델 정보 정의 (title + 키)
const models = [ const models = [
{ key: "old_model", title: "🧠 기존 모델 (Ho)" }, { key: "model1", title: "HO 모델" },
{ key: "new_model", title: "🚀 개선 모델 (Jun)" }, { key: "model2", title: "Jun 모델" },
]; ];
return ( return (
<div className="min-h-screen bg-gray-100 p-6"> <div className="min-h-screen bg-blue-50 p-8">
{!results ? ( <div className="grid grid-cols-1 md:grid-cols-2 gap-8 h-full">
<div className="flex justify-center items-center h-full">
<form onSubmit={handleSubmit} className="flex gap-4 w-full max-w-2xl"> {/* 왼쪽 입력창 */}
<div className="flex flex-col justify-center items-center gap-6">
<h1 className="text-2x1 font-bold text-blue-700">URL 판별기</h1>
<form onSubmit={handleSubmit} className="flex gap-2 w-full max-w-md">
<input <input
type="text" type="text"
value={url} value={url}
onChange={(e) => setUrl(e.target.value)} onChange={(e) => setUrl(e.target.value)}
placeholder="URL을 입력하세요" placeholder="URL을 입력하세요"
className="flex-grow px-4 py-2 border border-gray-300 rounded shadow" className="flex-grow px-4 py-2 border border-gray-300 rounded-lg shadow-md focus:outline-none focus:ring-2 focus:ring-blue-400"
required required
/> />
<button <button
type="submit" type="submit"
className="bg-blue-600 text-white px-6 py-2 rounded shadow hover:bg-blue-700 transition" className="bg-blue-600 text-white px-6 py-2 flex items-center gap-2 rounded-lg shadow-md hover:bg-blue-700 transition"
> >
검사하기 <FaSearch /> 검사
</button> </button>
</form> </form>
{loading && (
<div className="flex items-center gap-2">
<div className="w-6 h-6 border-4 border-blue-400 border-t-transparent rounded-full animate-spin"></div>
<p className="text-blue-600 font-semibold">분석 ...</p>
</div>
)}
{error && <p className="text-red-500"> {error}</p>}
</div> </div>
) : (
<div className="grid grid-cols-2 gap-6">
{/* 좌측 입력창 */}
<div className="flex flex-col gap-4">
<form onSubmit={handleSubmit} className="flex gap-2">
<input
type="text"
value={url}
onChange={(e) => setUrl(e.target.value)}
className="flex-grow px-4 py-2 border border-gray-300 rounded shadow"
placeholder="URL을 다시 입력해보세요"
required
/>
<button
type="submit"
className="bg-blue-600 text-white px-4 py-2 rounded hover:bg-blue-700 transition"
>
다시 검사
</button>
</form>
{loading && <p>🔍 분석 ...</p>}
{error && <p className="text-red-500"> {error}</p>}
</div>
{/* 우측 결과 반복 렌더링 */} {/* 오른쪽 결과창 */}
<div className="flex flex-col gap-4"> <div className="flex flex-col gap-6">
{models.map((model) => { {results ? (
models.map((model) => {
const data = results[model.key]; const data = results[model.key];
if (!data) return null;
return ( return (
<div key={model.key} className="bg-white rounded p-4 shadow"> <motion.div
<h2 className="text-lg font-bold mb-2">{model.title}</h2> key={model.key}
<p> initial={{ opacity: 0, y: 30 }}
악성 확률: <strong>{(data.prob * 100).toFixed(2)}%</strong> animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.6 }}
className="bg-white rounded-2xl p-6 shadow-lg border border-gray-200"
>
<h2 className="text-xl font-bold mb-4 text-gray-800">{model.title}</h2>
<p className="mb-2 text-gray-700">
악성 확률:{" "}
<strong>
{(data.malicious_probability * 100).toFixed(2)}%
</strong>
</p> </p>
<p> <p>
판별 결과:{" "} 판별 결과:{" "}
<strong className={data.malicious ? "text-red-600" : "text-green-600"}> <strong
{data.malicious ? "⚠️ 악성 URL" : "✅ 정상 URL"} className={
data.is_malicious
? "text-red-600"
: "text-green-600"
}
>
{data.is_malicious ? "⚠️ 악성 URL" : "✅ 정상 URL"}
</strong> </strong>
</p> </p>
</div> </motion.div>
); );
})} })
</div> ) : (
<div className="text-gray-500 flex items-center justify-center h-full">
결과가 여기에 표시됩니다.
</div>
)}
</div> </div>
)}
</div>
</div> </div>
); );
}; };
export default UrlPredictor; export default UrlPredictor;

View File

@@ -0,0 +1,26 @@
{
"compilerOptions": {
"target": "es5",
"lib": [
"dom",
"dom.iterable",
"esnext"
],
"allowJs": true,
"skipLibCheck": true,
"esModuleInterop": true,
"allowSyntheticDefaultImports": true,
"strict": true,
"forceConsistentCasingInFileNames": true,
"noFallthroughCasesInSwitch": true,
"module": "esnext",
"moduleResolution": "node",
"resolveJsonModule": true,
"isolatedModules": true,
"noEmit": true,
"jsx": "preserve"
},
"include": [
"src"
]
}

BIN
scaler.pkl Normal file

Binary file not shown.