"""Lexical feature extraction for URLs (character counts, flags, entropy)."""

import re
from collections import Counter

from scipy.stats import entropy

# Keywords counted by the 'suspicious_word_count' feature (matched as
# case-insensitive substrings of the URL).
_SUSPICIOUS_WORDS = (
    'login', 'verify', 'update', 'confirm',
    'account', 'secure', 'ebayisapi', 'banking',
)

# Patterns are compiled once at import time instead of on every call.
_DIGIT_RE = re.compile(r'\d')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')
_IPV4_LIKE_RE = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')
_LONG_DIGIT_RUN_RE = re.compile(r'\d{4,}')
_MULTI_DASH_RE = re.compile(r'-{2,}')


def calculate_url_entropy(url: str) -> float:
    """Return the Shannon entropy, in bits, of the characters in *url*.

    Args:
        url: The string whose character distribution is measured.

    Returns:
        The base-2 entropy of the character frequencies. An empty string
        has no characters to measure and yields ``0.0``.
    """
    if not url:
        # No characters means an empty probability vector; return a
        # well-defined value rather than relying on how scipy treats
        # empty input.
        return 0.0
    total = len(url)
    probabilities = [count / total for count in Counter(url).values()]
    return float(entropy(probabilities, base=2))


def extract_url_features(url: str) -> dict:
    """Compute lexical features of *url* and return them as a dict.

    All features are derived from the raw string only; the URL is not
    parsed, resolved, or fetched.

    Args:
        url: The URL string to analyse.

    Returns:
        A dict mapping feature name to value (ints, bools and one float),
        with keys in a fixed insertion order.
    """
    # Lower-case once; reused for every keyword membership test.
    lowered = url.lower()

    return {
        'digit_count': len(_DIGIT_RE.findall(url)),
        'dash_count': url.count('-'),
        'underscore_count': url.count('_'),
        'percent_count': url.count('%'),
        'equal_count': url.count('='),
        'question_count': url.count('?'),
        'at_count': url.count('@'),
        'count_of_exclamation': url.count('!'),
        'count_of_dot': url.count('.'),
        'count_of_double_slash': url.count('//'),
        # Every character outside [a-zA-Z0-9], including ':', '/', '.'.
        'special_char_count': len(_NON_ALNUM_RE.findall(url)),
        # Four dot-separated groups of 1-3 digits; octet ranges are not
        # validated, so e.g. "999.999.999.999" also matches.
        'is_ip_in_url': bool(_IPV4_LIKE_RE.search(url)),
        # Plain substring test: 'www' anywhere in the URL counts.
        'has_www': 'www' in url,
        'suspicious_word_count': sum(
            word in lowered for word in _SUSPICIOUS_WORDS
        ),
        # NOTE(review): the "- 2" presumably discounts the two slashes of
        # "scheme://"; the value is negative for URLs with fewer than two
        # slashes. Left unchanged to keep feature values stable.
        'path_depth': url.count('/') - 2,
        'has_long_digit_sequence': bool(_LONG_DIGIT_RUN_RE.search(url)),
        'has_multiple_dash': bool(_MULTI_DASH_RE.search(url)),
        'has_https': url.startswith('https'),
        'ends_with_common_extension': url.endswith(('.html', '.php')),
        'url_length': len(url),  # added feature
        'url_entropy': calculate_url_entropy(url),  # added feature
    }