275 lines
9.2 KiB
Python
Executable File
275 lines
9.2 KiB
Python
Executable File
import re
|
|
from urllib.parse import urlparse, parse_qs
|
|
import tldextract
|
|
import zlib
|
|
from collections import Counter
|
|
import math
|
|
|
|
def url_is_whitelisted(url):
    """Return True if the URL's host is a trusted domain or a subdomain of one.

    The previous check used a plain ``endswith(trusted)``, which also matched
    look-alike hosts such as ``evilnaver.com`` (it ends with ``naver.com``).
    The host now must either equal a trusted domain exactly or end with
    ``"." + domain`` (a genuine subdomain).

    Returns False for any malformed or non-string input instead of raising.
    """
    trusted_domains = [
        # 1. Portals / search engines
        'naver.com', 'daum.net', 'google.com', 'bing.com', 'yahoo.com',

        # 2. Social media / communication
        'facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'linkedin.com',
        'whatsapp.com', 'kakao.com', 'kakaocorp.com',

        # 3. Video / streaming
        'youtube.com', 'netflix.com', 'twitch.tv', 'tving.com', 'watcha.com',

        # 4. Shopping / e-commerce
        'amazon.com', 'gmarket.co.kr', '11st.co.kr', 'coupang.com', 'ssg.com', 'wemakeprice.com',

        # 5. Finance / payments
        'paypal.com', 'kbfg.com', 'shinhan.com', 'hanafn.com', 'wooribank.com',
        'kakaobank.com', 'toss.im',

        # 6. Government / education
        'gov.kr', 'moe.go.kr', 'epeople.go.kr', 'pusan.ac.kr', 'ac.kr',

        # 7. IT / technology
        'apple.com', 'microsoft.com', 'adobe.com', 'github.com', 'stackoverflow.com'
    ]

    try:
        # Prepend '//' when no scheme separator is present so urlparse
        # populates netloc instead of treating the whole URL as a path.
        domain = urlparse(url if '//' in url else '//' + url).netloc.lower()
        for trusted in trusted_domains:
            # Exact host or a real subdomain only — never a mere suffix.
            if domain == trusted or domain.endswith('.' + trusted):
                return True
        return False
    except Exception:
        # Malformed input (e.g. None) is simply not whitelisted.
        return False
|
|
|
|
|
|
|
|
def check_similar_brand(url):
    """Return True if the URL's host looks like a typosquat of a known brand.

    For every brand *not* literally present in the domain, leet-speak
    substitutions (o->0, i/l->1, e->3, a->4, s->5), last-character truncation
    and letter doubling are checked as substrings of the domain.

    Note: the original also tried ``brand + '-'`` / ``brand + '_'`` patterns,
    but those contain the literal brand and so can never match once the guard
    above has established the brand is absent — they were dead code and are
    removed; observable behavior is unchanged.

    Returns False on any parsing error.
    """
    # Frequently-impersonated brand/domain names.
    common_brands = {
        'google', 'facebook', 'amazon', 'microsoft', 'apple',
        'netflix', 'paypal', 'twitter', 'instagram', 'linkedin',
        'youtube', 'yahoo', 'gmail', 'whatsapp', 'tiktok',
        'geocities', 'angelfire', 'newadvent', 'wikipedia',
    }

    # Leet-speak character swaps commonly used in typosquatting.
    substitutions = (
        ('o', '0'), ('i', '1'), ('l', '1'),
        ('e', '3'), ('a', '4'), ('s', '5'),
    )

    try:
        # Prepend '//' when needed so urlparse fills in netloc.
        parsed = urlparse(url if '//' in url else '//' + url)
        domain = parsed.netloc.lower() if parsed.netloc else url.lower()

        for brand in common_brands:
            if brand in domain:
                # The literal brand is present: not a look-alike of itself.
                continue

            patterns = [brand.replace(old, new) for old, new in substitutions]
            patterns.append(brand[:-1])                     # last character dropped
            patterns.append(''.join(c * 2 for c in brand))  # every character doubled

            for pattern in patterns:
                # Skip no-op substitutions (pattern == brand cannot match
                # since the brand was just ruled out of the domain).
                if pattern != brand and pattern in domain:
                    return True
    except Exception:
        return False  # unparseable input: report no similar brand

    return False  # no look-alike brand found
|
|
|
|
|
|
|
|
# Compression-ratio feature for a URL string.
def compression_ratio(url: str) -> float:
    """Return len(zlib(url)) / len(url) over the UTF-8 encoding.

    Highly repetitive URLs compress well (low ratio); random-looking ones
    do not. An empty string yields 0.0.
    """
    if not url:
        return 0.0
    raw = url.encode('utf-8')
    return len(zlib.compress(raw)) / len(raw)
|
|
|
|
|
|
def extract_features(url):
    """Extract a dict of lexical/statistical phishing features from *url*.

    Features cover keyword hits, TLD/domain/subdomain shape, path/query
    structure, character-class ratios, Shannon entropy, zlib compression
    ratio and brand-lookalike detection.

    Fixes vs. the original:
    - The whitelist short-circuit used to run *after* all feature computation,
      wasting the entire extraction for trusted URLs; it is now checked first.
    - "check_similar_brand" returned a bool while the whitelist branch used
      an int; it is now consistently an int (0/1), with identical numeric value.
    """
    # Trusted URLs skip analysis entirely and get a canned "normal-looking"
    # feature vector.
    if url_is_whitelisted(url):
        return {
            "url_length_cat": 1,
            "num_dots": 1,
            "num_digits": 0,
            "num_special_chars": 1,
            "url_keyword": 0,
            "num_underbar": 0,
            "extract_consecutive_numbers": 0,
            "number": 0,
            "upper": 0,

            "is_common_tld": 1,
            "is_country_tld": 0,
            "is_suspicious_tld": 0,

            "domain_length": 5,
            "has_subdomain": 0,
            "subdomain_length": 0,
            "subdomain_count": 0,

            "path_depth": 0,
            "has_query": 0,
            "query_length": 0,
            "query_param_count": 0,
            "url_shorteners": 0,

            "compression_ratio": 1.0,
            "check_similar_brand": 0,
            "entropy": 3.0,
            "digit_ratio": 0.0,
            "special_char_ratio": 0.1
        }

    parsed_url = urlparse(url)

    # Phishing-associated keywords, matched as whole words anywhere in the URL.
    suspicious_keywords = [
        'login', 'verify', 'account', 'update', 'secure', 'banking',
        'paypal', 'confirm', 'signin', 'auth', 'redirect', 'free',
        'bonus', 'admin', 'support', 'server', 'password', 'click',
        'urgent', 'immediate', 'alert', 'security', 'prompt'
    ]

    additional_keywords = [
        'verify', 'wallet', 'cryptocurrency', 'bitcoin', 'ethereum',
        'validation', 'authenticate', 'reset', 'recover', 'access',
        'limited', 'offer', 'prize', 'win', 'winner', 'payment',
        'bank', 'credit', 'debit', 'card', 'expire', 'suspension',
        'unusual', 'activity', 'verify', 'document', 'invoice'
    ]

    all_keywords = list(set(suspicious_keywords + additional_keywords))

    contains_keyword = 0
    keyword_count = 0  # backs the currently-disabled "url_keyword_count" feature
    for keyword in all_keywords:
        if re.search(r'\b' + keyword + r'\b', url, re.IGNORECASE):
            contains_keyword = 1
            keyword_count += 1

    url_length = len(url)
    extracted = tldextract.extract(url)
    tld = extracted.suffix
    domain = extracted.domain
    subdomain = extracted.subdomain

    tld_length = len(tld) if tld else 0  # currently unused downstream
    common_tlds = ['com', 'org', 'net', 'edu', 'gov', 'mil', 'io', 'co', 'info', 'biz']
    is_common_tld = 1 if tld in common_tlds else 0
    country_tlds = ['us', 'uk', 'ca', 'au', 'de', 'fr', 'jp', 'cn', 'ru', 'br', 'in', 'it', 'es']
    is_country_tld = 1 if tld in country_tlds else 0
    suspicious_tlds = ['xyz', 'top', 'club', 'online', 'site', 'icu', 'vip', 'work', 'rest', 'fit']
    is_suspicious_tld = 1 if tld in suspicious_tlds else 0
    url_shorteners = ['bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'is.gd', 'buff.ly', 'adf.ly', 'tiny.cc']
    full_domain = f"{domain}.{tld}" if tld else domain
    is_shortened = 1 if full_domain in url_shorteners else 0

    domain_length = len(domain) if domain else 0
    has_subdomain = 1 if subdomain else 0
    subdomain_length = len(subdomain) if subdomain else 0
    subdomain_count = len(subdomain.split('.')) if subdomain else 0

    path = parsed_url.path
    path_length = len(path)  # backs the disabled "path_length" feature
    path_depth = path.count('/') if path else 0

    query = parsed_url.query
    has_query = 1 if query else 0
    query_length = len(query) if query else 0
    query_params = parse_qs(query)
    query_param_count = len(query_params) if query_params else 0

    # These back the disabled "has_fragment"/"fragment_length" features.
    has_fragment = 1 if parsed_url.fragment else 0
    fragment_length = len(parsed_url.fragment) if parsed_url.fragment else 0

    # Character-class ratios.
    letter_count = sum(c.isalpha() for c in url)
    digit_count = sum(c.isdigit() for c in url)
    special_char_count = len(re.findall(r'[^a-zA-Z0-9]', url))

    letter_ratio = letter_count / url_length if url_length > 0 else 0  # disabled feature
    digit_ratio = digit_count / url_length if url_length > 0 else 0
    special_char_ratio = special_char_count / url_length if url_length > 0 else 0

    # Shannon entropy of the URL's character distribution.
    if url:
        char_counts = Counter(url)
        total_chars = len(url)
        char_frequencies = {char: count / total_chars for char, count in char_counts.items()}
        entropy = -sum(freq * math.log2(freq) for freq in char_frequencies.values())
    else:
        entropy = 0

    # Bucket the raw URL length into 4 ordinal categories.
    if url_length <= 13:
        url_length_cat = 0
    elif url_length <= 18:
        url_length_cat = 1
    elif url_length <= 25:
        url_length_cat = 2
    else:
        url_length_cat = 3

    return {
        # "url_length": url_length,
        "url_length_cat": url_length_cat,
        "num_dots": url.count("."),
        "num_digits": sum(c.isdigit() for c in url),
        "num_special_chars": len(re.findall(r"[^a-zA-Z0-9]", url)),
        "url_keyword": contains_keyword,
        # "url_keyword_count": keyword_count,
        "num_underbar": url.count("_"),
        # Same digit repeated consecutively (e.g. "11").
        "extract_consecutive_numbers": int(bool(re.findall(r'(\d)\1+', url))),
        # Three consecutive digits where each differs from its predecessor.
        "number": int(bool(len(re.findall(r'(\d)(?!\1)(\d)(?!\2)(\d)', url)))),
        "upper": int(any(c.isupper() for c in url)),

        "is_common_tld": is_common_tld,
        "is_country_tld": is_country_tld,
        "is_suspicious_tld": is_suspicious_tld,

        "domain_length": domain_length,
        "has_subdomain": has_subdomain,
        "subdomain_length": subdomain_length,
        "subdomain_count": subdomain_count,

        # "path_length": path_length,
        "path_depth": path_depth,
        "has_query": has_query,
        "query_length": query_length,
        "query_param_count": query_param_count,
        # "has_fragment": has_fragment,
        # "fragment_length": fragment_length,
        "url_shorteners": is_shortened,

        # Recently added features.
        "compression_ratio": compression_ratio(url),
        # Cast to int so the type matches the whitelist branch (0/1).
        "check_similar_brand": int(check_similar_brand(url)),

        # Advanced text analysis.
        "entropy": entropy,
        # "letter_ratio": letter_ratio,
        "digit_ratio": digit_ratio,
        "special_char_ratio": special_char_ratio
    }
|