30 lines
1.2 KiB
Python
30 lines
1.2 KiB
Python
import re
|
|
|
|
def extract_url_features(url: str) -> dict:
    """Compute simple lexical features of a URL string.

    The URL is treated as plain text: nothing is parsed, decoded or fetched.
    Every feature is a character/substring count or a regex test over the
    raw string.

    Args:
        url: The URL text to inspect.

    Returns:
        A dict mapping feature names to ints (counts) or bools (flags), in
        the fixed key order shown below.
    """
    suspicious_words = (
        'login', 'verify', 'update', 'confirm',
        'account', 'secure', 'ebayisapi', 'banking',
    )
    # Lower-case once instead of once per word; the word match is
    # case-insensitive while every other feature looks at the raw string.
    lowered = url.lower()

    return {
        'digit_count': len(re.findall(r'\d', url)),
        'dash_count': url.count('-'),
        'underscore_count': url.count('_'),
        'percent_count': url.count('%'),
        'equal_count': url.count('='),
        'question_count': url.count('?'),
        'at_count': url.count('@'),
        'count_of_exclamation': url.count('!'),
        'count_of_dot': url.count('.'),
        # str.count counts non-overlapping matches, so '///' counts once.
        'count_of_double_slash': url.count('//'),
        # Anything outside ASCII letters and digits counts as "special".
        'special_char_count': len(re.findall(r'[^a-zA-Z0-9]', url)),
        # Dotted-quad shape only; octet values are not range-checked.
        'is_ip_in_url': bool(re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url)),
        # Substring test: true for 'www' anywhere in the URL, not just the host.
        'has_www': 'www' in url,
        # Each word contributes at most 1, however often it appears.
        'suspicious_word_count': sum(word in lowered for word in suspicious_words),
        # Slash count minus 2 (presumably discounting the '//' after the
        # scheme); negative when the URL has fewer than two slashes.
        'path_depth': url.count('/') - 2,
        'has_long_digit_sequence': bool(re.search(r'\d{4,}', url)),
        'has_multiple_dash': bool(re.search(r'-{2,}', url)),
        # Case-sensitive prefix test on the literal text 'https'.
        'has_https': url.startswith('https'),
        'ends_with_common_extension': url.endswith(('.html', '.php')),
    }
|