Upload files to "Nam"
This commit is contained in:
@@ -1,4 +1,11 @@
|
||||
import re
|
||||
from collections import Counter
|
||||
from scipy.stats import entropy
|
||||
|
||||
def calculate_url_entropy(url):
    """Return the Shannon entropy, in bits, of the characters in *url*.

    Each distinct character's relative frequency within the string is used
    as its probability, and the resulting distribution is passed to
    ``scipy.stats.entropy`` with ``base=2``.

    Args:
        url: The string whose character distribution is measured.

    Returns:
        The entropy as a float. An empty string has no characters to
        measure and yields ``0.0``.
    """
    # Guard the empty case explicitly: it would otherwise hand an empty
    # distribution to scipy, whose result for that input is not something
    # this function should depend on.
    if not url:
        return 0.0

    total = len(url)
    probabilities = [count / total for count in Counter(url).values()]
    return entropy(probabilities, base=2)
|
||||
|
||||
def extract_url_features(url):
|
||||
suspicious_words = [
|
||||
@@ -7,23 +14,25 @@ def extract_url_features(url):
|
||||
]
|
||||
|
||||
return {
|
||||
'digit_count' : len(re.findall(r'\d', url)),
|
||||
'dash_count' : url.count('-'),
|
||||
'underscore_count' : url.count('_'),
|
||||
'percent_count' : url.count('%'),
|
||||
'equal_count' : url.count('='),
|
||||
'question_count' : url.count('?'),
|
||||
'at_count' : url.count('@'),
|
||||
'count_of_exclamation' : url.count('!'),
|
||||
'count_of_dot' : url.count('.'),
|
||||
'count_of_double_slash' : url.count('//'),
|
||||
'special_char_count' : len(re.findall(r'[^a-zA-Z0-9]', url)),
|
||||
'is_ip_in_url' : bool(re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url)),
|
||||
'has_www' : 'www' in url,
|
||||
'suspicious_word_count' : sum(word in url.lower() for word in suspicious_words),
|
||||
'path_depth' : url.count('/') - 2,
|
||||
'has_long_digit_sequence' : bool(re.search(r'\d{4,}', url)),
|
||||
'has_multiple_dash' : bool(re.search(r'-{2,}', url)),
|
||||
'has_https' : url.startswith('https'),
|
||||
'ends_with_common_extension' : url.endswith(('.html', '.php'))
|
||||
'digit_count': len(re.findall(r'\d', url)),
|
||||
'dash_count': url.count('-'),
|
||||
'underscore_count': url.count('_'),
|
||||
'percent_count': url.count('%'),
|
||||
'equal_count': url.count('='),
|
||||
'question_count': url.count('?'),
|
||||
'at_count': url.count('@'),
|
||||
'count_of_exclamation': url.count('!'),
|
||||
'count_of_dot': url.count('.'),
|
||||
'count_of_double_slash': url.count('//'),
|
||||
'special_char_count': len(re.findall(r'[^a-zA-Z0-9]', url)),
|
||||
'is_ip_in_url': bool(re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', url)),
|
||||
'has_www': 'www' in url,
|
||||
'suspicious_word_count': sum(word in url.lower() for word in suspicious_words),
|
||||
'path_depth': url.count('/') - 2,
|
||||
'has_long_digit_sequence': bool(re.search(r'\d{4,}', url)),
|
||||
'has_multiple_dash': bool(re.search(r'-{2,}', url)),
|
||||
'has_https': url.startswith('https'),
|
||||
'ends_with_common_extension': url.endswith(('.html', '.php')),
|
||||
'url_length': len(url), # ✅ 추가
|
||||
'url_entropy': calculate_url_entropy(url) # ✅ 추가
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user