import re
from collections import Counter
from scipy.stats import entropy

def calculate_url_entropy(url):
    """Return the Shannon entropy, in bits, of the characters in ``url``.

    Each distinct character's relative frequency (occurrences divided by
    the total string length) is treated as a probability, and the resulting
    distribution is passed to ``scipy.stats.entropy`` with ``base=2``.

    Args:
        url: The string whose character distribution is measured.

    Returns:
        The entropy value produced by ``scipy.stats.entropy``.
    """
    total_chars = len(url)
    char_frequencies = Counter(url)

    # Convert raw occurrence counts into relative frequencies.
    probabilities = []
    for occurrences in char_frequencies.values():
        probabilities.append(occurrences / total_chars)

    return entropy(probabilities, base=2)

def extract_url_features(url):
    """Build a dictionary of lexical features computed from ``url``.

    The features are derived purely from the text of the URL: counts of
    individual characters and substrings, regex-based flags, a count of
    suspicious keywords, the overall length, and the character entropy.

    Args:
        url: The URL string to analyse.

    Returns:
        A dict mapping feature names to ``int``, ``bool`` or entropy values,
        inserted in a fixed order.
    """
    suspicious_words = (
        'login', 'verify', 'update', 'confirm',
        'account', 'secure', 'ebayisapi', 'banking',
    )
    # Lower-case once so the keyword check is case-insensitive.
    lowered_url = url.lower()

    # Simple substring counts, listed in the order they appear in the output.
    substring_counts = (
        ('dash_count', '-'),
        ('underscore_count', '_'),
        ('percent_count', '%'),
        ('equal_count', '='),
        ('question_count', '?'),
        ('at_count', '@'),
        ('count_of_exclamation', '!'),
        ('count_of_dot', '.'),
        ('count_of_double_slash', '//'),
    )

    features = {}
    features['digit_count'] = len(re.findall(r'\d', url))
    for feature_name, token in substring_counts:
        features[feature_name] = url.count(token)

    # Anything outside ASCII letters and digits counts as "special".
    features['special_char_count'] = len(re.findall(r'[^a-zA-Z0-9]', url))

    # Four dot-separated groups of 1-3 digits; the octet range is not checked.
    ip_match = re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url)
    features['is_ip_in_url'] = ip_match is not None

    # Matches "www" anywhere in the string, not only as a host prefix.
    features['has_www'] = 'www' in url

    # Number of distinct suspicious keywords present (each counted once).
    features['suspicious_word_count'] = sum(
        1 for word in suspicious_words if word in lowered_url
    )

    # Slash count minus two; presumably discounts the "//" after the scheme,
    # so the value can be negative for URLs without one.
    features['path_depth'] = url.count('/') - 2

    features['has_long_digit_sequence'] = re.search(r'\d{4,}', url) is not None
    features['has_multiple_dash'] = re.search(r'-{2,}', url) is not None
    features['has_https'] = url.startswith('https')
    features['ends_with_common_extension'] = url.endswith(('.html', '.php'))
    features['url_length'] = len(url)
    features['url_entropy'] = calculate_url_entropy(url)

    return features