Compare commits
14 Commits
6850f51341
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 8709a3cbc6 | |||
| ef6f0cb447 | |||
| 113bf7a747 | |||
| 52f20bfcea | |||
| 66f9061c4a | |||
| 8ccf9e8642 | |||
| 58056ce8a4 | |||
| 66f5c75d1e | |||
| 786ae98996 | |||
| 41dbe60e9a | |||
|
|
477fc5e159 | ||
| 8de5238395 | |||
|
|
11839c40c0 | ||
|
|
dcc3fc0a92 |
Submodule 1st-project deleted from 16f48f84a3
38
Nam/Feature.py
Normal file
38
Nam/Feature.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import re
|
||||||
|
from collections import Counter
|
||||||
|
from scipy.stats import entropy
|
||||||
|
|
||||||
|
def calculate_url_entropy(url):
    """Return the Shannon entropy, in bits, of the characters in ``url``.

    Args:
        url: The string whose per-character frequency distribution is measured.

    Returns:
        The base-2 entropy as a built-in ``float``; ``0.0`` when ``url`` is
        empty.
    """
    # An empty string has no character distribution. Answer explicitly instead
    # of handing scipy an empty probability vector.
    if not url:
        return 0.0

    counter = Counter(url)
    total = len(url)
    probabilities = [count / total for count in counter.values()]
    # Cast so callers receive a plain Python float rather than a NumPy scalar.
    return float(entropy(probabilities, base=2))
|
||||||
|
|
||||||
|
def extract_url_features(url):
    """Build the lexical feature dictionary for one URL string.

    Every feature is derived from the raw text of ``url`` only: character
    counts, regex-based flags, a suspicious-word count, the length and the
    character entropy.

    Args:
        url: The URL text to describe.

    Returns:
        A dict mapping feature name to an int, bool or float value.
    """
    suspicious_words = (
        'login', 'verify', 'update', 'confirm',
        'account', 'secure', 'ebayisapi', 'banking',
    )
    # Lower-case once; each suspicious word is then a plain substring test.
    lowered = url.lower()
    suspicious_hits = sum(1 for word in suspicious_words if word in lowered)

    features = {}

    # Plain character counts.
    features['digit_count'] = len(re.findall(r'\d', url))
    features['dash_count'] = url.count('-')
    features['underscore_count'] = url.count('_')
    features['percent_count'] = url.count('%')
    features['equal_count'] = url.count('=')
    features['question_count'] = url.count('?')
    features['at_count'] = url.count('@')
    features['count_of_exclamation'] = url.count('!')
    features['count_of_dot'] = url.count('.')
    features['count_of_double_slash'] = url.count('//')
    features['special_char_count'] = len(re.findall(r'[^a-zA-Z0-9]', url))

    # Pattern flags.
    features['is_ip_in_url'] = bool(re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url))
    # Substring test: true when 'www' occurs anywhere in the URL.
    features['has_www'] = 'www' in url
    features['suspicious_word_count'] = suspicious_hits
    # Slash count minus two; negative when the URL has fewer than two slashes.
    features['path_depth'] = url.count('/') - 2
    features['has_long_digit_sequence'] = bool(re.search(r'\d{4,}', url))
    features['has_multiple_dash'] = bool(re.search(r'-{2,}', url))
    features['has_https'] = url.startswith('https')
    features['ends_with_common_extension'] = url.endswith(('.html', '.php'))

    # Whole-string measures.
    features['url_length'] = len(url)
    features['url_entropy'] = calculate_url_entropy(url)

    return features
|
||||||
BIN
Nam/best_model 1.h5
Normal file
BIN
Nam/best_model 1.h5
Normal file
Binary file not shown.
54
Nam/model.running_code.py
Normal file
54
Nam/model.running_code.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import pickle
|
||||||
|
from tensorflow.keras.models import load_model
|
||||||
|
from Feature import extract_url_features
|
||||||
|
from collections import Counter
|
||||||
|
from scipy.stats import entropy
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
# 🔹 URL 엔트로피 계산 함수
|
||||||
|
def calculate_url_entropy(url):
    """Return the Shannon entropy, in bits, of the characters in ``url``.

    Args:
        url: The string whose per-character frequency distribution is measured.

    Returns:
        The base-2 entropy as a built-in ``float``; ``0.0`` when ``url`` is
        empty.
    """
    # An empty string has no character distribution. Answer explicitly instead
    # of handing scipy an empty probability vector.
    if not url:
        return 0.0

    counter = Counter(url)
    total = len(url)
    probabilities = [count / total for count in counter.values()]
    # Cast so callers receive a plain Python float rather than a NumPy scalar.
    return float(entropy(probabilities, base=2))
|
||||||
|
|
||||||
|
# Load the pickled scaler from the current working directory.
# NOTE(review): pickle.load can execute code embedded in the file; only use a
# scaler.pkl that comes from a trusted source.
with open("scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

# Load the Keras model from the current working directory.
model = load_model("best_model.h5")
|
||||||
|
|
||||||
|
# 🔹 예측 함수
|
||||||
|
@tf.function(reduce_retracing=True)
def predict_with_model(model, input_data):
    """Call ``model`` on ``input_data`` and return its output.

    The call is wrapped in ``tf.function`` with ``reduce_retracing=True``.

    Args:
        model: The callable model to invoke.
        input_data: The input passed to the model unchanged.

    Returns:
        Whatever ``model(input_data)`` returns.
    """
    outputs = model(input_data)
    return outputs
|
||||||
|
|
||||||
|
# Read the URL to classify from standard input.
url = input("URL입력 : ")

# Extract the features with Feature.extract_url_features.
features = extract_url_features(url)

# Make sure the length and entropy features are present.
features.update(
    url_length=len(url),
    url_entropy=calculate_url_entropy(url),
)

# Build a one-row frame and order its columns the way the scaler lists them.
input_df = pd.DataFrame([features])
expected_columns = list(scaler.feature_names_in_)
input_df = input_df[expected_columns]

# Scale the features.
input_scaled = scaler.transform(input_df)

# Run the model and take the first output value as a built-in float.
prediction = predict_with_model(model, input_scaled)
score = float(prediction.numpy()[0][0])

# Report: a score above the threshold is printed as malicious, otherwise as
# benign together with the complementary probability.
threshold = 0.5
is_malicious = score > threshold
if is_malicious:
    message = f"악성 (악성일 확률: {score:.4f})"
else:
    message = f"정상 (정상일 확률: {1 - score:.4f})"
print(message)
|
||||||
BIN
Nam/model.scaler.pkl
Normal file
BIN
Nam/model.scaler.pkl
Normal file
Binary file not shown.
1795
Nam/url data_preprocessing_undersampling_val.ipynb
Normal file
1795
Nam/url data_preprocessing_undersampling_val.ipynb
Normal file
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
backend/app/__pycache__/predictor.cpython-310.pyc
Normal file
BIN
backend/app/__pycache__/predictor.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
@@ -1,52 +1,53 @@
|
|||||||
from app.junPreP import extract_features
|
from app.junPreP import extract_features
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pickle
|
import pickle
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.preprocessing import MinMaxScaler
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
from tensorflow.keras.models import load_model
|
from tensorflow.keras.models import load_model
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# 모델 및 스케일러 경로 (FastAPI 기준으로 맞춰서 절대 경로 또는 경로 설정)
|
# 모델 및 스케일러 경로 (FastAPI 기준으로 맞춰서 절대 경로 또는 경로 설정)
|
||||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
MODEL_PATH = os.path.join(BASE_DIR, "models", "Recall_0.77.keras")
|
MODEL_PATH = os.path.join(BASE_DIR, "models", "White_list_model.keras")
|
||||||
SCALER_PATH = os.path.join(BASE_DIR, "models", "scaler.pkl")
|
SCALER_PATH = os.path.join(BASE_DIR, "models", "scaler.pkl")
|
||||||
|
|
||||||
# 모델 및 스케일러 로드 (1회만 수행)
|
# 모델 및 스케일러 로드 (1회만 수행)
|
||||||
model = load_model(MODEL_PATH)
|
model = load_model(MODEL_PATH)
|
||||||
with open(SCALER_PATH, 'rb') as f:
|
with open(SCALER_PATH, 'rb') as f:
|
||||||
scaler = pickle.load(f)
|
scaler = pickle.load(f)
|
||||||
|
|
||||||
# @tf.function으로 추론 최적화
|
# @tf.function으로 추론 최적화
|
||||||
@tf.function(reduce_retracing=True)
|
@tf.function(reduce_retracing=True)
|
||||||
def predict_with_model(model, input_data):
|
def predict_with_model(model, input_data):
|
||||||
return model(input_data)
|
return model(input_data)
|
||||||
|
|
||||||
# Threshold (적절히 조정 가능)
|
# Threshold (적절히 조정 가능)
|
||||||
BEST_THRESHOLD = 0.4034
|
BEST_THRESHOLD = 0.4034
|
||||||
|
|
||||||
# 📦 예측 함수 정의 (FastAPI에서 import해서 사용)
|
# 📦 예측 함수 정의 (FastAPI에서 import해서 사용)
|
||||||
def predict_url_maliciousness(url: str) -> dict:
|
def predict_url_maliciousness(url: str) -> dict:
|
||||||
# 특성 추출
|
# 특성 추출
|
||||||
features = extract_features(url)
|
features = extract_features(url)
|
||||||
input_df = pd.DataFrame([list(features.values())], columns=features.keys())
|
input_df = pd.DataFrame([list(features.values())], columns=features.keys())
|
||||||
|
|
||||||
# 스케일링
|
# 스케일링
|
||||||
input_scaled = scaler.transform(input_df)
|
input_scaled = scaler.transform(input_df)
|
||||||
|
|
||||||
# 예측
|
# 예측
|
||||||
prediction = predict_with_model(model, input_scaled)
|
prediction = predict_with_model(model, input_scaled)
|
||||||
malicious_prob = float(prediction[0][0])
|
malicious_prob = float(prediction[0][0].numpy())
|
||||||
|
|
||||||
# 임계값 기반 판단
|
|
||||||
is_malicious = bool(malicious_prob > BEST_THRESHOLD)
|
# 임계값 기반 판단
|
||||||
|
is_malicious = bool(malicious_prob > BEST_THRESHOLD)
|
||||||
# Ensure all values are Python native types (not numpy types)
|
|
||||||
return {
|
# 예: malicious_probability가 np.float32 타입일 경우
|
||||||
"url": str(url),
|
return {
|
||||||
"malicious_probability": float(malicious_prob),
|
"url": str(url),
|
||||||
"is_malicious": bool(is_malicious),
|
"malicious_probability": malicious_prob,
|
||||||
"threshold": float(BEST_THRESHOLD)
|
"is_malicious": is_malicious,
|
||||||
}
|
"threshold": float(BEST_THRESHOLD)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,204 +1,274 @@
|
|||||||
import re
|
import re
|
||||||
from urllib.parse import urlparse, parse_qs
|
from urllib.parse import urlparse, parse_qs
|
||||||
import tldextract
|
import tldextract
|
||||||
import zlib
|
import zlib
|
||||||
import re
|
from collections import Counter
|
||||||
from urllib.parse import urlparse
|
import math
|
||||||
from collections import Counter
|
|
||||||
import math
|
def url_is_whitelisted(url):
|
||||||
|
trusted_domains = [
|
||||||
|
# 1. 포털 / 검색엔진
|
||||||
|
'naver.com', 'daum.net', 'google.com', 'bing.com', 'yahoo.com',
|
||||||
def check_similar_brand(url):
|
|
||||||
# 자주 사용되는 브랜드/도메인 목록
|
# 2. 소셜 미디어 / 커뮤니케이션
|
||||||
common_brands = {
|
'facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'linkedin.com',
|
||||||
'google', 'facebook', 'amazon', 'microsoft', 'apple',
|
'whatsapp.com', 'kakao.com', 'kakaocorp.com',
|
||||||
'netflix', 'paypal', 'twitter', 'instagram', 'linkedin',
|
|
||||||
'youtube', 'yahoo', 'gmail', 'whatsapp', 'tiktok',
|
# 3. 동영상 / 스트리밍
|
||||||
'geocities', 'angelfire', 'newadvent', 'wikipedia',
|
'youtube.com', 'netflix.com', 'twitch.tv', 'tving.com', 'watcha.com',
|
||||||
}
|
|
||||||
|
# 4. 쇼핑 / 이커머스
|
||||||
# 2. 유사 브랜드 확인
|
'amazon.com', 'gmarket.co.kr', '11st.co.kr', 'coupang.com', 'ssg.com', 'wemakeprice.com',
|
||||||
try:
|
|
||||||
# URL 파싱
|
# 5. 금융 / 결제
|
||||||
parsed = urlparse(url if '//' in url else '//' + url)
|
'paypal.com', 'kbfg.com', 'shinhan.com', 'hanafn.com', 'wooribank.com',
|
||||||
domain = parsed.netloc.lower() if parsed.netloc else url.lower()
|
'kakaobank.com', 'toss.im',
|
||||||
|
|
||||||
for brand in common_brands:
|
# 6. 공공기관 / 교육
|
||||||
if brand not in domain:
|
'gov.kr', 'moe.go.kr', 'epeople.go.kr', 'pusan.ac.kr', 'ac.kr',
|
||||||
similar = False
|
|
||||||
# 비슷한 철자 패턴 확인
|
# 7. IT / 기술
|
||||||
patterns = [
|
'apple.com', 'microsoft.com', 'adobe.com', 'github.com', 'stackoverflow.com'
|
||||||
brand.replace('o', '0'),
|
]
|
||||||
brand.replace('i', '1'),
|
|
||||||
brand.replace('l', '1'),
|
try:
|
||||||
brand.replace('e', '3'),
|
domain = urlparse(url if '//' in url else '//' + url).netloc.lower()
|
||||||
brand.replace('a', '4'),
|
for trusted in trusted_domains:
|
||||||
brand.replace('s', '5'),
|
if domain.endswith(trusted):
|
||||||
brand + '-',
|
return True
|
||||||
brand + '_',
|
return False
|
||||||
brand[:-1], # 마지막 문자 제거
|
except:
|
||||||
''.join(c + c for c in brand), # 문자 중복
|
return False
|
||||||
]
|
|
||||||
|
|
||||||
for pattern in patterns:
|
|
||||||
if pattern in domain:
|
def check_similar_brand(url):
|
||||||
similar = True
|
# 자주 사용되는 브랜드/도메인 목록
|
||||||
break
|
common_brands = {
|
||||||
|
'google', 'facebook', 'amazon', 'microsoft', 'apple',
|
||||||
if similar:
|
'netflix', 'paypal', 'twitter', 'instagram', 'linkedin',
|
||||||
return True # 유사 브랜드가 발견되면 True 반환
|
'youtube', 'yahoo', 'gmail', 'whatsapp', 'tiktok',
|
||||||
|
'geocities', 'angelfire', 'newadvent', 'wikipedia',
|
||||||
except Exception as e:
|
}
|
||||||
return False # 예외 발생 시 False 반환
|
|
||||||
|
# 2. 유사 브랜드 확인
|
||||||
return False # 유사 브랜드가 없으면 False 반환
|
try:
|
||||||
|
# URL 파싱
|
||||||
|
parsed = urlparse(url if '//' in url else '//' + url)
|
||||||
|
domain = parsed.netloc.lower() if parsed.netloc else url.lower()
|
||||||
# url 압축 비율 계산 함수
|
|
||||||
def compression_ratio(url: str) -> float:
|
for brand in common_brands:
|
||||||
if not url:
|
if brand not in domain:
|
||||||
return 0.0
|
similar = False
|
||||||
original_length = len(url.encode('utf-8'))
|
# 비슷한 철자 패턴 확인
|
||||||
compressed_data = zlib.compress(url.encode('utf-8'))
|
patterns = [
|
||||||
compressed_length = len(compressed_data)
|
brand.replace('o', '0'),
|
||||||
return compressed_length / original_length
|
brand.replace('i', '1'),
|
||||||
|
brand.replace('l', '1'),
|
||||||
|
brand.replace('e', '3'),
|
||||||
def extract_features(url):
|
brand.replace('a', '4'),
|
||||||
parsed_url = urlparse(url)
|
brand.replace('s', '5'),
|
||||||
suspicious_keywords = [
|
brand + '-',
|
||||||
'login', 'verify', 'account', 'update', 'secure', 'banking',
|
brand + '_',
|
||||||
'paypal', 'confirm', 'signin', 'auth', 'redirect', 'free',
|
brand[:-1], # 마지막 문자 제거
|
||||||
'bonus', 'admin', 'support', 'server', 'password', 'click',
|
''.join(c + c for c in brand), # 문자 중복
|
||||||
'urgent', 'immediate', 'alert', 'security', 'prompt'
|
]
|
||||||
]
|
|
||||||
|
for pattern in patterns:
|
||||||
additional_keywords = [
|
if pattern in domain:
|
||||||
'verify', 'wallet', 'cryptocurrency', 'bitcoin', 'ethereum',
|
similar = True
|
||||||
'validation', 'authenticate', 'reset', 'recover', 'access',
|
break
|
||||||
'limited', 'offer', 'prize', 'win', 'winner', 'payment',
|
|
||||||
'bank', 'credit', 'debit', 'card', 'expire', 'suspension',
|
if similar:
|
||||||
'unusual', 'activity', 'verify', 'document', 'invoice'
|
return True # 유사 브랜드가 발견되면 True 반환
|
||||||
]
|
|
||||||
|
except Exception as e:
|
||||||
all_keywords = list(set(suspicious_keywords + additional_keywords))
|
return False # 예외 발생 시 False 반환
|
||||||
|
|
||||||
contains_keyword = 0
|
return False # 유사 브랜드가 없으면 False 반환
|
||||||
keyword_count = 0
|
|
||||||
for keyword in all_keywords:
|
|
||||||
if re.search(r'\b' + keyword + r'\b', url, re.IGNORECASE):
|
|
||||||
contains_keyword = 1
|
# url 압축 비율 계산 함수
|
||||||
keyword_count += 1
|
def compression_ratio(url: str) -> float:
|
||||||
|
if not url:
|
||||||
url_length = len(url)
|
return 0.0
|
||||||
extracted = tldextract.extract(url)
|
original_length = len(url.encode('utf-8'))
|
||||||
tld = extracted.suffix
|
compressed_data = zlib.compress(url.encode('utf-8'))
|
||||||
domain = extracted.domain
|
compressed_length = len(compressed_data)
|
||||||
subdomain = extracted.subdomain
|
return compressed_length / original_length
|
||||||
|
|
||||||
tld_length = len(tld) if tld else 0
|
|
||||||
common_tlds = ['com', 'org', 'net', 'edu', 'gov', 'mil', 'io', 'co', 'info', 'biz']
|
def extract_features(url):
|
||||||
is_common_tld = 1 if tld in common_tlds else 0
|
parsed_url = urlparse(url)
|
||||||
country_tlds = ['us', 'uk', 'ca', 'au', 'de', 'fr', 'jp', 'cn', 'ru', 'br', 'in', 'it', 'es']
|
suspicious_keywords = [
|
||||||
is_country_tld = 1 if tld in country_tlds else 0
|
'login', 'verify', 'account', 'update', 'secure', 'banking',
|
||||||
suspicious_tlds = ['xyz', 'top', 'club', 'online', 'site', 'icu', 'vip', 'work', 'rest', 'fit']
|
'paypal', 'confirm', 'signin', 'auth', 'redirect', 'free',
|
||||||
is_suspicious_tld = 1 if tld in suspicious_tlds else 0
|
'bonus', 'admin', 'support', 'server', 'password', 'click',
|
||||||
url_shorteners = ['bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'is.gd', 'buff.ly', 'adf.ly', 'tiny.cc']
|
'urgent', 'immediate', 'alert', 'security', 'prompt'
|
||||||
full_domain = f"{domain}.{tld}" if tld else domain
|
]
|
||||||
is_shortened = 1 if full_domain in url_shorteners else 0
|
|
||||||
|
additional_keywords = [
|
||||||
|
'verify', 'wallet', 'cryptocurrency', 'bitcoin', 'ethereum',
|
||||||
domain_length = len(domain) if domain else 0
|
'validation', 'authenticate', 'reset', 'recover', 'access',
|
||||||
has_subdomain = 1 if subdomain else 0
|
'limited', 'offer', 'prize', 'win', 'winner', 'payment',
|
||||||
subdomain_length = len(subdomain) if subdomain else 0
|
'bank', 'credit', 'debit', 'card', 'expire', 'suspension',
|
||||||
subdomain_count = len(subdomain.split('.')) if subdomain else 0
|
'unusual', 'activity', 'verify', 'document', 'invoice'
|
||||||
|
]
|
||||||
path = parsed_url.path
|
|
||||||
path_length = len(path)
|
all_keywords = list(set(suspicious_keywords + additional_keywords))
|
||||||
path_depth = path.count('/') if path else 0
|
|
||||||
|
contains_keyword = 0
|
||||||
query = parsed_url.query
|
keyword_count = 0
|
||||||
has_query = 1 if query else 0
|
for keyword in all_keywords:
|
||||||
query_length = len(query) if query else 0
|
if re.search(r'\b' + keyword + r'\b', url, re.IGNORECASE):
|
||||||
query_params = parse_qs(query)
|
contains_keyword = 1
|
||||||
query_param_count = len(query_params) if query_params else 0
|
keyword_count += 1
|
||||||
|
|
||||||
has_fragment = 1 if parsed_url.fragment else 0
|
url_length = len(url)
|
||||||
fragment_length = len(parsed_url.fragment) if parsed_url.fragment else 0
|
extracted = tldextract.extract(url)
|
||||||
|
tld = extracted.suffix
|
||||||
# Character type ratios
|
domain = extracted.domain
|
||||||
letter_count = sum(c.isalpha() for c in url)
|
subdomain = extracted.subdomain
|
||||||
digit_count = sum(c.isdigit() for c in url)
|
|
||||||
special_char_count = len(re.findall(r'[^a-zA-Z0-9]', url))
|
tld_length = len(tld) if tld else 0
|
||||||
|
common_tlds = ['com', 'org', 'net', 'edu', 'gov', 'mil', 'io', 'co', 'info', 'biz']
|
||||||
letter_ratio = letter_count / url_length if url_length > 0 else 0
|
is_common_tld = 1 if tld in common_tlds else 0
|
||||||
digit_ratio = digit_count / url_length if url_length > 0 else 0
|
country_tlds = ['us', 'uk', 'ca', 'au', 'de', 'fr', 'jp', 'cn', 'ru', 'br', 'in', 'it', 'es']
|
||||||
special_char_ratio = special_char_count / url_length if url_length > 0 else 0
|
is_country_tld = 1 if tld in country_tlds else 0
|
||||||
|
suspicious_tlds = ['xyz', 'top', 'club', 'online', 'site', 'icu', 'vip', 'work', 'rest', 'fit']
|
||||||
# Character distribution and entropy
|
is_suspicious_tld = 1 if tld in suspicious_tlds else 0
|
||||||
if url:
|
url_shorteners = ['bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'is.gd', 'buff.ly', 'adf.ly', 'tiny.cc']
|
||||||
char_counts = Counter(url)
|
full_domain = f"{domain}.{tld}" if tld else domain
|
||||||
total_chars = len(url)
|
is_shortened = 1 if full_domain in url_shorteners else 0
|
||||||
char_frequencies = {char: count/total_chars for char, count in char_counts.items()}
|
|
||||||
entropy = -sum(freq * math.log2(freq) for freq in char_frequencies.values())
|
|
||||||
else:
|
domain_length = len(domain) if domain else 0
|
||||||
entropy = 0
|
has_subdomain = 1 if subdomain else 0
|
||||||
|
subdomain_length = len(subdomain) if subdomain else 0
|
||||||
|
subdomain_count = len(subdomain.split('.')) if subdomain else 0
|
||||||
|
|
||||||
|
path = parsed_url.path
|
||||||
|
path_length = len(path)
|
||||||
if url_length <= 13:
|
path_depth = path.count('/') if path else 0
|
||||||
url_length_cat = 0
|
|
||||||
elif url_length <= 18:
|
query = parsed_url.query
|
||||||
url_length_cat = 1
|
has_query = 1 if query else 0
|
||||||
elif url_length <= 25:
|
query_length = len(query) if query else 0
|
||||||
url_length_cat = 2
|
query_params = parse_qs(query)
|
||||||
else:
|
query_param_count = len(query_params) if query_params else 0
|
||||||
url_length_cat = 3
|
|
||||||
|
has_fragment = 1 if parsed_url.fragment else 0
|
||||||
return {
|
fragment_length = len(parsed_url.fragment) if parsed_url.fragment else 0
|
||||||
# "url_length": url_length,
|
|
||||||
"url_length_cat": url_length_cat,
|
# Character type ratios
|
||||||
"num_dots": url.count("."),
|
letter_count = sum(c.isalpha() for c in url)
|
||||||
"num_digits": sum(c.isdigit() for c in url),
|
digit_count = sum(c.isdigit() for c in url)
|
||||||
"num_special_chars": len(re.findall(r"[^a-zA-Z0-9]", url)),
|
special_char_count = len(re.findall(r'[^a-zA-Z0-9]', url))
|
||||||
"url_keyword": contains_keyword,
|
|
||||||
# "url_keyword_count": keyword_count,
|
letter_ratio = letter_count / url_length if url_length > 0 else 0
|
||||||
"num_underbar": url.count("_"),
|
digit_ratio = digit_count / url_length if url_length > 0 else 0
|
||||||
"extract_consecutive_numbers": int(bool(re.findall(r'(\d)\1+', url))),
|
special_char_ratio = special_char_count / url_length if url_length > 0 else 0
|
||||||
"number": int(bool(len(re.findall(r'(\d)(?!\1)(\d)(?!\2)(\d)', url)))),
|
|
||||||
"upper": int(any(c.isupper() for c in url)),
|
# Character distribution and entropy
|
||||||
|
if url:
|
||||||
"is_common_tld": is_common_tld,
|
char_counts = Counter(url)
|
||||||
"is country_tld": is_country_tld,
|
total_chars = len(url)
|
||||||
"is_suspicious_tld": is_suspicious_tld,
|
char_frequencies = {char: count/total_chars for char, count in char_counts.items()}
|
||||||
|
entropy = -sum(freq * math.log2(freq) for freq in char_frequencies.values())
|
||||||
"domain_length": domain_length,
|
else:
|
||||||
"has_subdomain": has_subdomain,
|
entropy = 0
|
||||||
"subdomain_length": subdomain_length,
|
|
||||||
"subdomain_count": subdomain_count,
|
|
||||||
|
|
||||||
# "path_length": path_length,
|
|
||||||
"path_depth": path_depth,
|
|
||||||
"has_query": has_query,
|
if url_length <= 13:
|
||||||
"query_length": query_length,
|
url_length_cat = 0
|
||||||
"query_param_count": query_param_count,
|
elif url_length <= 18:
|
||||||
# "has_fragment": has_fragment,
|
url_length_cat = 1
|
||||||
# "fragment_length": fragment_length,
|
elif url_length <= 25:
|
||||||
"url_shorteners": is_shortened,
|
url_length_cat = 2
|
||||||
|
else:
|
||||||
# 새로 추가된 특성
|
url_length_cat = 3
|
||||||
"compression_ratio": compression_ratio(url),
|
|
||||||
"check_similar_brand" : check_similar_brand(url),
|
if url_is_whitelisted(url):
|
||||||
|
return {
|
||||||
# Advanced text analysis
|
# 화이트리스트 URL이면 특징값들을 "정상적"으로 처리되도록 설정
|
||||||
"entropy": entropy,
|
"url_length_cat": 1,
|
||||||
#"letter_ratio": letter_ratio,
|
"num_dots": 1,
|
||||||
"digit_ratio": digit_ratio,
|
"num_digits": 0,
|
||||||
"special_char_ratio": special_char_ratio
|
"num_special_chars": 1,
|
||||||
|
"url_keyword": 0,
|
||||||
|
"num_underbar": 0,
|
||||||
}
|
"extract_consecutive_numbers": 0,
|
||||||
|
"number": 0,
|
||||||
|
"upper": 0,
|
||||||
|
|
||||||
|
"is_common_tld": 1,
|
||||||
|
"is country_tld": 0,
|
||||||
|
"is_suspicious_tld": 0,
|
||||||
|
|
||||||
|
"domain_length": 5,
|
||||||
|
"has_subdomain": 0,
|
||||||
|
"subdomain_length": 0,
|
||||||
|
"subdomain_count": 0,
|
||||||
|
|
||||||
|
"path_depth": 0,
|
||||||
|
"has_query": 0,
|
||||||
|
"query_length": 0,
|
||||||
|
"query_param_count": 0,
|
||||||
|
"url_shorteners": 0,
|
||||||
|
|
||||||
|
"compression_ratio": 1.0,
|
||||||
|
"check_similar_brand": 0,
|
||||||
|
"entropy": 3.0,
|
||||||
|
"digit_ratio": 0.0,
|
||||||
|
"special_char_ratio": 0.1
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return {
|
||||||
|
|
||||||
|
# "url_length": url_length,
|
||||||
|
"url_length_cat": url_length_cat,
|
||||||
|
"num_dots": url.count("."),
|
||||||
|
"num_digits": sum(c.isdigit() for c in url),
|
||||||
|
"num_special_chars": len(re.findall(r"[^a-zA-Z0-9]", url)),
|
||||||
|
"url_keyword": contains_keyword,
|
||||||
|
# "url_keyword_count": keyword_count,
|
||||||
|
"num_underbar": url.count("_"),
|
||||||
|
"extract_consecutive_numbers": int(bool(re.findall(r'(\d)\1+', url))),
|
||||||
|
"number": int(bool(len(re.findall(r'(\d)(?!\1)(\d)(?!\2)(\d)', url)))),
|
||||||
|
"upper": int(any(c.isupper() for c in url)),
|
||||||
|
|
||||||
|
"is_common_tld": is_common_tld,
|
||||||
|
"is country_tld": is_country_tld,
|
||||||
|
"is_suspicious_tld": is_suspicious_tld,
|
||||||
|
|
||||||
|
"domain_length": domain_length,
|
||||||
|
"has_subdomain": has_subdomain,
|
||||||
|
"subdomain_length": subdomain_length,
|
||||||
|
"subdomain_count": subdomain_count,
|
||||||
|
|
||||||
|
# "path_length": path_length,
|
||||||
|
"path_depth": path_depth,
|
||||||
|
"has_query": has_query,
|
||||||
|
"query_length": query_length,
|
||||||
|
"query_param_count": query_param_count,
|
||||||
|
# "has_fragment": has_fragment,
|
||||||
|
# "fragment_length": fragment_length,
|
||||||
|
"url_shorteners": is_shortened,
|
||||||
|
|
||||||
|
# 새로 추가된 특성
|
||||||
|
"compression_ratio": compression_ratio(url),
|
||||||
|
"check_similar_brand" : check_similar_brand(url),
|
||||||
|
|
||||||
|
# Advanced text analysis
|
||||||
|
"entropy": entropy,
|
||||||
|
#"letter_ratio": letter_ratio,
|
||||||
|
"digit_ratio": digit_ratio,
|
||||||
|
"special_char_ratio": special_char_ratio
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ from fastapi import FastAPI
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from app.model_load import use_model # predictor.py에서 함수 import
|
from app.model_load import use_model # predictor.py에서 함수 import
|
||||||
from app.exe import predict_url_maliciousness
|
from app.exe import predict_url_maliciousness
|
||||||
from app.utils import convert_numpy_to_python_types
|
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
@@ -28,13 +27,15 @@ def root():
|
|||||||
def predict(request: UrlRequest):
|
def predict(request: UrlRequest):
|
||||||
url = request.url
|
url = request.url
|
||||||
|
|
||||||
result_model1 = convert_numpy_to_python_types(use_model(url))
|
result_model1 = use_model(url)
|
||||||
result_model2 = convert_numpy_to_python_types(predict_url_maliciousness(url))
|
result_model2 = predict_url_maliciousness(url)
|
||||||
|
# print("model1 : ")
|
||||||
response_data = {
|
# print(result_model1.values())
|
||||||
"url": url,
|
# print("model2 : ")
|
||||||
"model1": result_model1,
|
# print(result_model2.values())
|
||||||
"model2": result_model2
|
|
||||||
}
|
return {
|
||||||
|
"url" : url,
|
||||||
return convert_numpy_to_python_types(response_data)
|
"model1": result_model1,
|
||||||
|
"model2": result_model2
|
||||||
|
}
|
||||||
|
|||||||
@@ -29,6 +29,12 @@ def use_model(url : str):
|
|||||||
input_data = featured_df[features_cols]
|
input_data = featured_df[features_cols]
|
||||||
|
|
||||||
# 학습된 모델에 적용
|
# 학습된 모델에 적용
|
||||||
model_pred = round(float(np.mean([model.predict_proba(input_data)[:, 1] for model in models_load])), 4)
|
model_pred = round(np.mean([model.predict_proba(input_data)[:, 1] for model in models_load]), 4)
|
||||||
|
|
||||||
return model_pred
|
#return model_pred
|
||||||
|
return {
|
||||||
|
"url" : url,
|
||||||
|
"malicious_probability" : float(model_pred),
|
||||||
|
"is_malicious" : bool(model_pred > best_threshold),
|
||||||
|
"threshold" : float(best_threshold)
|
||||||
|
}
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ def predict_url(url: str) -> dict:
|
|||||||
input_data = preprocessed[features_cols]
|
input_data = preprocessed[features_cols]
|
||||||
|
|
||||||
# ✅ 전처리된 데이터 확인
|
# ✅ 전처리된 데이터 확인
|
||||||
print("Preprocessed input:", input_data)
|
#print("Preprocessed input:", input_data)
|
||||||
|
|
||||||
# 평균 확률 계산
|
# 평균 확률 계산
|
||||||
probs = [float(model.predict_proba(input_data)[0, 1]) for model in models_load]
|
probs = [float(model.predict_proba(input_data)[0, 1]) for model in models_load]
|
||||||
@@ -61,8 +61,8 @@ def predict_url(url: str) -> dict:
|
|||||||
# 예: malicious_probability가 np.float32 타입일 경우
|
# 예: malicious_probability가 np.float32 타입일 경우
|
||||||
return {
|
return {
|
||||||
"url": url,
|
"url": url,
|
||||||
"malicious_probability": mean_pred, # ⬅️ numpy -> float
|
"malicious_probability": mean_pred,
|
||||||
"is_malicious": bool(is_malicious), # ⬅️ numpy -> bool
|
"is_malicious": is_malicious,
|
||||||
"threshold": float(BEST_THRESHOLD) # ⬅️ numpy -> float
|
"threshold": float(BEST_THRESHOLD) # ⬅️ numpy -> float
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
4
backend/app/testexe.py
Normal file
4
backend/app/testexe.py
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
from exe import predict_url_maliciousness
|
||||||
|
|
||||||
|
# Manual check: run the predictor on one sample URL and print what it returns.
result_model2 = predict_url_maliciousness("www.naver.com")
print(result_model2)
|
||||||
@@ -1,18 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
|
|
||||||
def convert_numpy_to_python_types(obj):
    """Return ``obj`` with NumPy values replaced by built-in Python values.

    Containers are walked recursively: arrays become (nested) lists, dict
    values are converted while keys are kept, and lists and tuples both come
    back as lists. NumPy floating scalars become ``float``, other NumPy
    numbers become ``int``, and ``np.bool_`` becomes ``bool``. Anything else
    is returned unchanged.
    """
    # Arrays: convert to nested lists first, then clean the elements.
    if isinstance(obj, np.ndarray):
        return convert_numpy_to_python_types(obj.tolist())

    # NumPy numeric scalars.
    if isinstance(obj, np.number):
        if isinstance(obj, np.floating):
            return float(obj)
        return int(obj)

    if isinstance(obj, np.bool_):
        return bool(obj)

    if isinstance(obj, dict):
        converted = {}
        for key, value in obj.items():
            converted[key] = convert_numpy_to_python_types(value)
        return converted

    # Tuples are returned as lists, same as lists.
    if isinstance(obj, (list, tuple)):
        return [convert_numpy_to_python_types(element) for element in obj]

    return obj
|
|
||||||
BIN
best_model.h5
Normal file
BIN
best_model.h5
Normal file
Binary file not shown.
733
jun/code.ipynb
733
jun/code.ipynb
@@ -2392,703 +2392,56 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 2,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"ename": "NameError",
|
||||||
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
|
"evalue": "name 'processed_train' is not defined",
|
||||||
"columns": [
|
"output_type": "error",
|
||||||
{
|
"traceback": [
|
||||||
"name": "index",
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||||
"rawType": "object",
|
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
|
||||||
"type": "string"
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mprocessed_train\u001b[49m.describe()\n",
|
||||||
},
|
"\u001b[31mNameError\u001b[39m: name 'processed_train' is not defined"
|
||||||
{
|
]
|
||||||
"name": "label",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "url_length_cat",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "num_dots",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "num_digits",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "num_special_chars",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "url_keyword",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "num_underbar",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "extract_consecutive_numbers",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "number",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "upper",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "is_common_tld",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "is_country_tld",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "is_suspicious_tld",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "domain_length",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "has_subdomain",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "subdomain_length",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "subdomain_count",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "path_depth",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "has_query",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "query_length",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "query_param_count",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "url_shorteners",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "compression_ratio",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "entropy",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "digit_ratio",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "special_char_ratio",
|
|
||||||
"rawType": "float64",
|
|
||||||
"type": "float"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"conversionMethod": "pd.DataFrame",
|
|
||||||
"ref": "c79a077e-8e52-4e42-b88f-dc9698b0fa30",
|
|
||||||
"rows": [
|
|
||||||
[
|
|
||||||
"count",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0",
|
|
||||||
"6995056.0"
|
|
||||||
],
|
|
||||||
[
|
|
||||||
"mean",
|
|
||||||
"0.22371472079708868",
|
|
||||||
"1.4435534183000107",
|
|
||||||
"1.546944584861079",
|
|
||||||
"1.6343590387267808",
|
|
||||||
"2.6635716711917676",
|
|
||||||
"0.0370789025849114",
|
|
||||||
"0.045005501028154746",
|
|
||||||
"0.056463736673444787",
|
|
||||||
"0.08128040719044995",
|
|
||||||
"0.0357764112252997",
|
|
||||||
"0.6133649251700057",
|
|
||||||
"0.12739140329970197",
|
|
||||||
"0.022784949827420967",
|
|
||||||
"10.464007150192936",
|
|
||||||
"0.21130266862767075",
|
|
||||||
"2.43731000866898",
|
|
||||||
"0.2660177416735477",
|
|
||||||
"0.6056849294701858",
|
|
||||||
"0.027221368921135157",
|
|
||||||
"1.9155892390282507",
|
|
||||||
"0.04228915393958247",
|
|
||||||
"0.0018421582329004942",
|
|
||||||
"1.4552534994784176",
|
|
||||||
"3.5360434022769756",
|
|
||||||
"0.029042428345387533",
|
|
||||||
"0.1102289088601276"
|
|
||||||
],
|
|
||||||
[
|
|
||||||
"std",
|
|
||||||
"0.41673309122602675",
|
|
||||||
"1.1161203432813147",
|
|
||||||
"1.010078604927829",
|
|
||||||
"9.827940363271033",
|
|
||||||
"7.1618457272654",
|
|
||||||
"0.18895518694176003",
|
|
||||||
"0.6023702991784359",
|
|
||||||
"0.23081505741717664",
|
|
||||||
"0.273265280035072",
|
|
||||||
"0.18573223887275842",
|
|
||||||
"0.4869788780260291",
|
|
||||||
"0.33341093196934307",
|
|
||||||
"0.14921728811320575",
|
|
||||||
"5.0652546813544035",
|
|
||||||
"0.4082326232468674",
|
|
||||||
"6.90096602515224",
|
|
||||||
"0.6272395647222854",
|
|
||||||
"1.6003209664806863",
|
|
||||||
"0.1627279010519657",
|
|
||||||
"19.702068343354906",
|
|
||||||
"0.35208851309719974",
|
|
||||||
"0.04288082262284407",
|
|
||||||
"0.24856536988340924",
|
|
||||||
"0.47898938276414027",
|
|
||||||
"0.08255957016074264",
|
|
||||||
"0.046338026902092454"
|
|
||||||
],
|
|
||||||
[
|
|
||||||
"min",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.010181818181818183",
|
|
||||||
"-0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0"
|
|
||||||
],
|
|
||||||
[
|
|
||||||
"25%",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"1.0",
|
|
||||||
"0.0",
|
|
||||||
"1.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"7.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"1.3076923076923077",
|
|
||||||
"3.238901256602631",
|
|
||||||
"0.0",
|
|
||||||
"0.07142857142857142"
|
|
||||||
],
|
|
||||||
[
|
|
||||||
"50%",
|
|
||||||
"0.0",
|
|
||||||
"1.0",
|
|
||||||
"1.0",
|
|
||||||
"0.0",
|
|
||||||
"2.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"1.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"10.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"1.4444444444444444",
|
|
||||||
"3.5068905956085183",
|
|
||||||
"0.0",
|
|
||||||
"0.10344827586206896"
|
|
||||||
],
|
|
||||||
[
|
|
||||||
"75%",
|
|
||||||
"0.0",
|
|
||||||
"2.0",
|
|
||||||
"2.0",
|
|
||||||
"0.0",
|
|
||||||
"3.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"1.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"13.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"1.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"0.0",
|
|
||||||
"1.6153846153846154",
|
|
||||||
"3.7962176025900556",
|
|
||||||
"0.0",
|
|
||||||
"0.14285714285714285"
|
|
||||||
],
|
|
||||||
[
|
|
||||||
"max",
|
|
||||||
"1.0",
|
|
||||||
"3.0",
|
|
||||||
"171.0",
|
|
||||||
"2011.0",
|
|
||||||
"8198.0",
|
|
||||||
"1.0",
|
|
||||||
"136.0",
|
|
||||||
"1.0",
|
|
||||||
"1.0",
|
|
||||||
"1.0",
|
|
||||||
"1.0",
|
|
||||||
"1.0",
|
|
||||||
"1.0",
|
|
||||||
"63.0",
|
|
||||||
"1.0",
|
|
||||||
"237.0",
|
|
||||||
"38.0",
|
|
||||||
"136.0",
|
|
||||||
"1.0",
|
|
||||||
"8367.0",
|
|
||||||
"131.0",
|
|
||||||
"1.0",
|
|
||||||
"5.0",
|
|
||||||
"6.570554108088201",
|
|
||||||
"0.9545454545454546",
|
|
||||||
"1.0"
|
|
||||||
]
|
|
||||||
],
|
|
||||||
"shape": {
|
|
||||||
"columns": 26,
|
|
||||||
"rows": 8
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>label</th>\n",
|
|
||||||
" <th>url_length_cat</th>\n",
|
|
||||||
" <th>num_dots</th>\n",
|
|
||||||
" <th>num_digits</th>\n",
|
|
||||||
" <th>num_special_chars</th>\n",
|
|
||||||
" <th>url_keyword</th>\n",
|
|
||||||
" <th>num_underbar</th>\n",
|
|
||||||
" <th>extract_consecutive_numbers</th>\n",
|
|
||||||
" <th>number</th>\n",
|
|
||||||
" <th>upper</th>\n",
|
|
||||||
" <th>...</th>\n",
|
|
||||||
" <th>subdomain_count</th>\n",
|
|
||||||
" <th>path_depth</th>\n",
|
|
||||||
" <th>has_query</th>\n",
|
|
||||||
" <th>query_length</th>\n",
|
|
||||||
" <th>query_param_count</th>\n",
|
|
||||||
" <th>url_shorteners</th>\n",
|
|
||||||
" <th>compression_ratio</th>\n",
|
|
||||||
" <th>entropy</th>\n",
|
|
||||||
" <th>digit_ratio</th>\n",
|
|
||||||
" <th>special_char_ratio</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>count</th>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" <td>6.995056e+06</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>mean</th>\n",
|
|
||||||
" <td>2.237147e-01</td>\n",
|
|
||||||
" <td>1.443553e+00</td>\n",
|
|
||||||
" <td>1.546945e+00</td>\n",
|
|
||||||
" <td>1.634359e+00</td>\n",
|
|
||||||
" <td>2.663572e+00</td>\n",
|
|
||||||
" <td>3.707890e-02</td>\n",
|
|
||||||
" <td>4.500550e-02</td>\n",
|
|
||||||
" <td>5.646374e-02</td>\n",
|
|
||||||
" <td>8.128041e-02</td>\n",
|
|
||||||
" <td>3.577641e-02</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>2.660177e-01</td>\n",
|
|
||||||
" <td>6.056849e-01</td>\n",
|
|
||||||
" <td>2.722137e-02</td>\n",
|
|
||||||
" <td>1.915589e+00</td>\n",
|
|
||||||
" <td>4.228915e-02</td>\n",
|
|
||||||
" <td>1.842158e-03</td>\n",
|
|
||||||
" <td>1.455253e+00</td>\n",
|
|
||||||
" <td>3.536043e+00</td>\n",
|
|
||||||
" <td>2.904243e-02</td>\n",
|
|
||||||
" <td>1.102289e-01</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>std</th>\n",
|
|
||||||
" <td>4.167331e-01</td>\n",
|
|
||||||
" <td>1.116120e+00</td>\n",
|
|
||||||
" <td>1.010079e+00</td>\n",
|
|
||||||
" <td>9.827940e+00</td>\n",
|
|
||||||
" <td>7.161846e+00</td>\n",
|
|
||||||
" <td>1.889552e-01</td>\n",
|
|
||||||
" <td>6.023703e-01</td>\n",
|
|
||||||
" <td>2.308151e-01</td>\n",
|
|
||||||
" <td>2.732653e-01</td>\n",
|
|
||||||
" <td>1.857322e-01</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>6.272396e-01</td>\n",
|
|
||||||
" <td>1.600321e+00</td>\n",
|
|
||||||
" <td>1.627279e-01</td>\n",
|
|
||||||
" <td>1.970207e+01</td>\n",
|
|
||||||
" <td>3.520885e-01</td>\n",
|
|
||||||
" <td>4.288082e-02</td>\n",
|
|
||||||
" <td>2.485654e-01</td>\n",
|
|
||||||
" <td>4.789894e-01</td>\n",
|
|
||||||
" <td>8.255957e-02</td>\n",
|
|
||||||
" <td>4.633803e-02</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>min</th>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>1.018182e-02</td>\n",
|
|
||||||
" <td>-0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>25%</th>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>1.307692e+00</td>\n",
|
|
||||||
" <td>3.238901e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>7.142857e-02</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>50%</th>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>2.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>1.444444e+00</td>\n",
|
|
||||||
" <td>3.506891e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>1.034483e-01</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>75%</th>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>2.000000e+00</td>\n",
|
|
||||||
" <td>2.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>3.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>1.615385e+00</td>\n",
|
|
||||||
" <td>3.796218e+00</td>\n",
|
|
||||||
" <td>0.000000e+00</td>\n",
|
|
||||||
" <td>1.428571e-01</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>max</th>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>3.000000e+00</td>\n",
|
|
||||||
" <td>1.710000e+02</td>\n",
|
|
||||||
" <td>2.011000e+03</td>\n",
|
|
||||||
" <td>8.198000e+03</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>1.360000e+02</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>3.800000e+01</td>\n",
|
|
||||||
" <td>1.360000e+02</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>8.367000e+03</td>\n",
|
|
||||||
" <td>1.310000e+02</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" <td>5.000000e+00</td>\n",
|
|
||||||
" <td>6.570554e+00</td>\n",
|
|
||||||
" <td>9.545455e-01</td>\n",
|
|
||||||
" <td>1.000000e+00</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"<p>8 rows × 26 columns</p>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" label url_length_cat num_dots num_digits \\\n",
|
|
||||||
"count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n",
|
|
||||||
"mean 2.237147e-01 1.443553e+00 1.546945e+00 1.634359e+00 \n",
|
|
||||||
"std 4.167331e-01 1.116120e+00 1.010079e+00 9.827940e+00 \n",
|
|
||||||
"min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
|
|
||||||
"25% 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00 \n",
|
|
||||||
"50% 0.000000e+00 1.000000e+00 1.000000e+00 0.000000e+00 \n",
|
|
||||||
"75% 0.000000e+00 2.000000e+00 2.000000e+00 0.000000e+00 \n",
|
|
||||||
"max 1.000000e+00 3.000000e+00 1.710000e+02 2.011000e+03 \n",
|
|
||||||
"\n",
|
|
||||||
" num_special_chars url_keyword num_underbar \\\n",
|
|
||||||
"count 6.995056e+06 6.995056e+06 6.995056e+06 \n",
|
|
||||||
"mean 2.663572e+00 3.707890e-02 4.500550e-02 \n",
|
|
||||||
"std 7.161846e+00 1.889552e-01 6.023703e-01 \n",
|
|
||||||
"min 0.000000e+00 0.000000e+00 0.000000e+00 \n",
|
|
||||||
"25% 1.000000e+00 0.000000e+00 0.000000e+00 \n",
|
|
||||||
"50% 2.000000e+00 0.000000e+00 0.000000e+00 \n",
|
|
||||||
"75% 3.000000e+00 0.000000e+00 0.000000e+00 \n",
|
|
||||||
"max 8.198000e+03 1.000000e+00 1.360000e+02 \n",
|
|
||||||
"\n",
|
|
||||||
" extract_consecutive_numbers number upper ... \\\n",
|
|
||||||
"count 6.995056e+06 6.995056e+06 6.995056e+06 ... \n",
|
|
||||||
"mean 5.646374e-02 8.128041e-02 3.577641e-02 ... \n",
|
|
||||||
"std 2.308151e-01 2.732653e-01 1.857322e-01 ... \n",
|
|
||||||
"min 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
|
|
||||||
"25% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
|
|
||||||
"50% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
|
|
||||||
"75% 0.000000e+00 0.000000e+00 0.000000e+00 ... \n",
|
|
||||||
"max 1.000000e+00 1.000000e+00 1.000000e+00 ... \n",
|
|
||||||
"\n",
|
|
||||||
" subdomain_count path_depth has_query query_length \\\n",
|
|
||||||
"count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n",
|
|
||||||
"mean 2.660177e-01 6.056849e-01 2.722137e-02 1.915589e+00 \n",
|
|
||||||
"std 6.272396e-01 1.600321e+00 1.627279e-01 1.970207e+01 \n",
|
|
||||||
"min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
|
|
||||||
"25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
|
|
||||||
"50% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
|
|
||||||
"75% 0.000000e+00 1.000000e+00 0.000000e+00 0.000000e+00 \n",
|
|
||||||
"max 3.800000e+01 1.360000e+02 1.000000e+00 8.367000e+03 \n",
|
|
||||||
"\n",
|
|
||||||
" query_param_count url_shorteners compression_ratio entropy \\\n",
|
|
||||||
"count 6.995056e+06 6.995056e+06 6.995056e+06 6.995056e+06 \n",
|
|
||||||
"mean 4.228915e-02 1.842158e-03 1.455253e+00 3.536043e+00 \n",
|
|
||||||
"std 3.520885e-01 4.288082e-02 2.485654e-01 4.789894e-01 \n",
|
|
||||||
"min 0.000000e+00 0.000000e+00 1.018182e-02 -0.000000e+00 \n",
|
|
||||||
"25% 0.000000e+00 0.000000e+00 1.307692e+00 3.238901e+00 \n",
|
|
||||||
"50% 0.000000e+00 0.000000e+00 1.444444e+00 3.506891e+00 \n",
|
|
||||||
"75% 0.000000e+00 0.000000e+00 1.615385e+00 3.796218e+00 \n",
|
|
||||||
"max 1.310000e+02 1.000000e+00 5.000000e+00 6.570554e+00 \n",
|
|
||||||
"\n",
|
|
||||||
" digit_ratio special_char_ratio \n",
|
|
||||||
"count 6.995056e+06 6.995056e+06 \n",
|
|
||||||
"mean 2.904243e-02 1.102289e-01 \n",
|
|
||||||
"std 8.255957e-02 4.633803e-02 \n",
|
|
||||||
"min 0.000000e+00 0.000000e+00 \n",
|
|
||||||
"25% 0.000000e+00 7.142857e-02 \n",
|
|
||||||
"50% 0.000000e+00 1.034483e-01 \n",
|
|
||||||
"75% 0.000000e+00 1.428571e-01 \n",
|
|
||||||
"max 9.545455e-01 1.000000e+00 \n",
|
|
||||||
"\n",
|
|
||||||
"[8 rows x 26 columns]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 10,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"processed_train.describe()"
|
"processed_train.describe()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "NameError",
|
||||||
|
"evalue": "name 'processed_train' is not defined",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||||
|
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
|
||||||
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mmatplotlib\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mpyplot\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mplt\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mseaborn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01msns\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m desc = \u001b[43mprocessed_train\u001b[49m.describe()\n\u001b[32m 6\u001b[39m plt.figure(figsize=(\u001b[32m12\u001b[39m, \u001b[32m6\u001b[39m))\n\u001b[32m 7\u001b[39m sns.barplot(data=desc.T[[\u001b[33m'\u001b[39m\u001b[33mmean\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mstd\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mmin\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mmax\u001b[39m\u001b[33m'\u001b[39m]])\n",
|
||||||
|
"\u001b[31mNameError\u001b[39m: name 'processed_train' is not defined"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"import seaborn as sns\n",
|
||||||
|
"\n",
|
||||||
|
"desc = processed_train.describe()\n",
|
||||||
|
"\n",
|
||||||
|
"plt.figure(figsize=(12, 6))\n",
|
||||||
|
"sns.barplot(data=desc.T[['mean', 'std', 'min', 'max']])\n",
|
||||||
|
"plt.title('Feature Statistics')\n",
|
||||||
|
"plt.xticks(rotation=45)\n",
|
||||||
|
"plt.tight_layout()\n",
|
||||||
|
"plt.show()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 11,
|
||||||
@@ -3248,12 +2601,12 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 18,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import tensorflow as tf\n",
|
"import tensorflow as tf\n",
|
||||||
"from tensorflow.keras.layers import Dense, Dropout, BatchNormalization\n",
|
"from tensorflow.keras.layers import Dense\n",
|
||||||
"\n",
|
"\n",
|
||||||
"def build_model(input_dim, learning_rate=0.001):\n",
|
"def build_model(input_dim, learning_rate=0.001):\n",
|
||||||
" \"\"\"\n",
|
" \"\"\"\n",
|
||||||
|
|||||||
34744
react-url-checker/package-lock.json
generated
34744
react-url-checker/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -12,10 +12,11 @@
|
|||||||
"@types/react": "^19.1.0",
|
"@types/react": "^19.1.0",
|
||||||
"@types/react-dom": "^19.1.1",
|
"@types/react-dom": "^19.1.1",
|
||||||
"axios": "^1.8.4",
|
"axios": "^1.8.4",
|
||||||
|
"framer-motion": "^12.9.2",
|
||||||
"react": "^19.1.0",
|
"react": "^19.1.0",
|
||||||
"react-dom": "^19.1.0",
|
"react-dom": "^19.1.0",
|
||||||
"react-icons": "^5.5.0",
|
"react-icons": "^5.5.0",
|
||||||
"react-scripts": "^3.0.1",
|
"react-scripts": "^5.0.1",
|
||||||
"web-vitals": "^2.1.4"
|
"web-vitals": "^2.1.4"
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
@@ -43,9 +44,11 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
"@babel/preset-react": "^7.26.3",
|
||||||
"autoprefixer": "^10.4.21",
|
"autoprefixer": "^10.4.21",
|
||||||
|
"eslint": "^8.57.1",
|
||||||
"postcss": "^8.5.3",
|
"postcss": "^8.5.3",
|
||||||
"tailwindcss": "^3.3.5",
|
"tailwindcss": "^3.3.5",
|
||||||
"typescript": "^5.3.3"
|
"typescript": "^4.1.2"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8" />
|
<meta charset="UTF-8" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>URL 악성 판별기</title>
|
<title></title>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div id="root"></div>
|
<div id="root"></div>
|
||||||
|
|||||||
3
react-url-checker/src/App.js
vendored
3
react-url-checker/src/App.js
vendored
@@ -5,9 +5,8 @@ import './App.css';
|
|||||||
|
|
||||||
function App() {
|
function App() {
|
||||||
return (
|
return (
|
||||||
<div className="min-h-screen bg-gray-100 flex flex-col justify-center">
|
<div className="min-h-screen bg-sky-200 flex flex-col justify-center">
|
||||||
<div className="container mx-auto px-4 text-center">
|
<div className="container mx-auto px-4 text-center">
|
||||||
<h1 className="text-3xl font-bold text-blue-600 mb-6">🔍 악성 URL 판별기</h1>
|
|
||||||
<UrlPredictor />
|
<UrlPredictor />
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
import React, { useState } from "react";
|
import React, { useState } from "react";
|
||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
|
import { motion } from "framer-motion"; // 애니메이션용
|
||||||
|
import { FaSearch, FaRedo } from "react-icons/fa"; // 아이콘용
|
||||||
|
|
||||||
const UrlPredictor = () => {
|
const UrlPredictor = () => {
|
||||||
const [url, setUrl] = useState("");
|
const [url, setUrl] = useState("");
|
||||||
@@ -26,81 +28,90 @@ const UrlPredictor = () => {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// 모델 정보 정의 (title + 키)
|
|
||||||
const models = [
|
const models = [
|
||||||
{ key: "old_model", title: "🧠 기존 모델 (Ho)" },
|
{ key: "model1", title: "HO 모델" },
|
||||||
{ key: "new_model", title: "🚀 개선 모델 (Jun)" },
|
{ key: "model2", title: "Jun 모델" },
|
||||||
];
|
];
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="min-h-screen bg-gray-100 p-6">
|
<div className="min-h-screen bg-blue-50 p-8">
|
||||||
{!results ? (
|
<div className="grid grid-cols-1 md:grid-cols-2 gap-8 h-full">
|
||||||
<div className="flex justify-center items-center h-full">
|
|
||||||
<form onSubmit={handleSubmit} className="flex gap-4 w-full max-w-2xl">
|
{/* 왼쪽 입력창 */}
|
||||||
|
<div className="flex flex-col justify-center items-center gap-6">
|
||||||
|
<h1 className="text-2x1 font-bold text-blue-700">URL 판별기</h1>
|
||||||
|
<form onSubmit={handleSubmit} className="flex gap-2 w-full max-w-md">
|
||||||
<input
|
<input
|
||||||
type="text"
|
type="text"
|
||||||
value={url}
|
value={url}
|
||||||
onChange={(e) => setUrl(e.target.value)}
|
onChange={(e) => setUrl(e.target.value)}
|
||||||
placeholder="URL을 입력하세요"
|
placeholder="URL을 입력하세요"
|
||||||
className="flex-grow px-4 py-2 border border-gray-300 rounded shadow"
|
className="flex-grow px-4 py-2 border border-gray-300 rounded-lg shadow-md focus:outline-none focus:ring-2 focus:ring-blue-400"
|
||||||
required
|
required
|
||||||
/>
|
/>
|
||||||
<button
|
<button
|
||||||
type="submit"
|
type="submit"
|
||||||
className="bg-blue-600 text-white px-6 py-2 rounded shadow hover:bg-blue-700 transition"
|
className="bg-blue-600 text-white px-6 py-2 flex items-center gap-2 rounded-lg shadow-md hover:bg-blue-700 transition"
|
||||||
>
|
>
|
||||||
✅ 검사하기
|
<FaSearch /> 검사
|
||||||
</button>
|
</button>
|
||||||
</form>
|
</form>
|
||||||
|
{loading && (
|
||||||
|
<div className="flex items-center gap-2">
|
||||||
|
<div className="w-6 h-6 border-4 border-blue-400 border-t-transparent rounded-full animate-spin"></div>
|
||||||
|
<p className="text-blue-600 font-semibold">분석 중...</p>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
{error && <p className="text-red-500">❌ {error}</p>}
|
||||||
</div>
|
</div>
|
||||||
) : (
|
|
||||||
<div className="grid grid-cols-2 gap-6">
|
|
||||||
{/* 좌측 입력창 */}
|
|
||||||
<div className="flex flex-col gap-4">
|
|
||||||
<form onSubmit={handleSubmit} className="flex gap-2">
|
|
||||||
<input
|
|
||||||
type="text"
|
|
||||||
value={url}
|
|
||||||
onChange={(e) => setUrl(e.target.value)}
|
|
||||||
className="flex-grow px-4 py-2 border border-gray-300 rounded shadow"
|
|
||||||
placeholder="URL을 다시 입력해보세요"
|
|
||||||
required
|
|
||||||
/>
|
|
||||||
<button
|
|
||||||
type="submit"
|
|
||||||
className="bg-blue-600 text-white px-4 py-2 rounded hover:bg-blue-700 transition"
|
|
||||||
>
|
|
||||||
다시 검사
|
|
||||||
</button>
|
|
||||||
</form>
|
|
||||||
{loading && <p>🔍 분석 중...</p>}
|
|
||||||
{error && <p className="text-red-500">❌ {error}</p>}
|
|
||||||
</div>
|
|
||||||
|
|
||||||
{/* 우측 결과 반복 렌더링 */}
|
{/* 오른쪽 결과창 */}
|
||||||
<div className="flex flex-col gap-4">
|
<div className="flex flex-col gap-6">
|
||||||
{models.map((model) => {
|
{results ? (
|
||||||
|
models.map((model) => {
|
||||||
const data = results[model.key];
|
const data = results[model.key];
|
||||||
|
if (!data) return null;
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div key={model.key} className="bg-white rounded p-4 shadow">
|
<motion.div
|
||||||
<h2 className="text-lg font-bold mb-2">{model.title}</h2>
|
key={model.key}
|
||||||
<p>
|
initial={{ opacity: 0, y: 30 }}
|
||||||
악성 확률: <strong>{(data.prob * 100).toFixed(2)}%</strong>
|
animate={{ opacity: 1, y: 0 }}
|
||||||
|
transition={{ duration: 0.6 }}
|
||||||
|
className="bg-white rounded-2xl p-6 shadow-lg border border-gray-200"
|
||||||
|
>
|
||||||
|
<h2 className="text-xl font-bold mb-4 text-gray-800">{model.title}</h2>
|
||||||
|
<p className="mb-2 text-gray-700">
|
||||||
|
악성 확률:{" "}
|
||||||
|
<strong>
|
||||||
|
{(data.malicious_probability * 100).toFixed(2)}%
|
||||||
|
</strong>
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
판별 결과:{" "}
|
판별 결과:{" "}
|
||||||
<strong className={data.malicious ? "text-red-600" : "text-green-600"}>
|
<strong
|
||||||
{data.malicious ? "⚠️ 악성 URL" : "✅ 정상 URL"}
|
className={
|
||||||
|
data.is_malicious
|
||||||
|
? "text-red-600"
|
||||||
|
: "text-green-600"
|
||||||
|
}
|
||||||
|
>
|
||||||
|
{data.is_malicious ? "⚠️ 악성 URL" : "✅ 정상 URL"}
|
||||||
</strong>
|
</strong>
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</motion.div>
|
||||||
);
|
);
|
||||||
})}
|
})
|
||||||
</div>
|
) : (
|
||||||
|
<div className="text-gray-500 flex items-center justify-center h-full">
|
||||||
|
결과가 여기에 표시됩니다.
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
)}
|
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
export default UrlPredictor;
|
export default UrlPredictor;
|
||||||
26
react-url-checker/tsconfig (copy 1).json
Normal file
26
react-url-checker/tsconfig (copy 1).json
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"target": "es5",
|
||||||
|
"lib": [
|
||||||
|
"dom",
|
||||||
|
"dom.iterable",
|
||||||
|
"esnext"
|
||||||
|
],
|
||||||
|
"allowJs": true,
|
||||||
|
"skipLibCheck": true,
|
||||||
|
"esModuleInterop": true,
|
||||||
|
"allowSyntheticDefaultImports": true,
|
||||||
|
"strict": true,
|
||||||
|
"forceConsistentCasingInFileNames": true,
|
||||||
|
"noFallthroughCasesInSwitch": true,
|
||||||
|
"module": "esnext",
|
||||||
|
"moduleResolution": "node",
|
||||||
|
"resolveJsonModule": true,
|
||||||
|
"isolatedModules": true,
|
||||||
|
"noEmit": true,
|
||||||
|
"jsx": "preserve"
|
||||||
|
},
|
||||||
|
"include": [
|
||||||
|
"src"
|
||||||
|
]
|
||||||
|
}
|
||||||
BIN
scaler.pkl
Normal file
BIN
scaler.pkl
Normal file
Binary file not shown.
Reference in New Issue
Block a user