Files
1st-project/backend/app/predictor.py

73 lines
2.3 KiB
Python

import logging
import os

import joblib
import numpy as np

from app.PreP import preprocess_single_url
# Model directory: a "models" folder located next to this file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_DIR = os.path.join(BASE_DIR, "models")

# Manual smoke test, kept for reference only (disabled):
#   url_pre = preprocess_single_url("https://example.com/some/path?x=1")
#   input_data = url_pre[features_cols]

# Feature columns selected from the preprocessed URL data and fed to the
# models. The order is kept exactly as originally written.
# NOTE(review): both 'special_chars_count' and 'special_char_count' are
# listed; confirm they are two distinct columns produced by preprocessing
# and not a typo of one another.
features_cols = [
    'subdomain_count',
    'letters_count',
    'digits_count',
    'special_chars_count',
    'use_of_ip',
    'path_depth',
    'max_numeric_sequence',
    'file_extension',
    'special_char_count',
    'url_length_cat',
    'suspicious_keyword_count',
    'has_suspicious_keyword',
]

# Decision threshold applied to the averaged model probability.
# Defined once here (it was previously assigned twice with the same value).
BEST_THRESHOLD = 0.6563

# Number of per-fold model files expected in MODEL_DIR
# (xgboost_model_fold1.pkl .. xgboost_model_fold4.pkl).
_N_FOLDS = 4

# Load every fold model once at import time.
# To load a single model instead:
#   model_load = joblib.load('xgboost_model_fold1.pkl')
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code;
# only files from a trusted source may be placed in MODEL_DIR.
models_load = [
    joblib.load(os.path.join(MODEL_DIR, f'xgboost_model_fold{fold}.pkl'))
    for fold in range(1, _N_FOLDS + 1)
]
def predict_url(url: str) -> dict:
    """Score a URL with every loaded model and report the averaged verdict.

    Args:
        url: The URL to classify.

    Returns:
        On success, a dict with the keys ``url``, ``malicious_probability``
        (mean over all models in ``models_load``), ``is_malicious`` (``True``
        when that mean is strictly greater than ``BEST_THRESHOLD``) and
        ``threshold``. On any failure, ``{"error": <message>}``; exceptions
        are logged with their traceback and never propagate to the caller.
    """
    logger = logging.getLogger(__name__)
    try:
        preprocessed = preprocess_single_url(url)
        input_data = preprocessed[features_cols]

        # Row 0, column 1 of each model's predict_proba output.
        # NOTE(review): column 1 is assumed to be the "malicious" class;
        # confirm against the label encoding used in training.
        probs = [
            float(model.predict_proba(input_data)[0, 1])
            for model in models_load
        ]
        mean_pred = float(np.mean(probs))
        logger.debug("Model prediction result: %s", mean_pred)

        # Plain bool/float values keep the result JSON-serialisable
        # (no numpy scalar types leak into the response).
        is_malicious = bool(mean_pred > BEST_THRESHOLD)
        return {
            "url": url,
            "malicious_probability": mean_pred,
            "is_malicious": is_malicious,
            "threshold": float(BEST_THRESHOLD),
        }
    except Exception as e:
        # Boundary handler: callers receive an error dict rather than an
        # exception, so record the traceback here or it is lost entirely.
        logger.exception("URL prediction failed for %r", url)
        return {"error": str(e)}