态势感知-机器训练

0x01 svm模型训练

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- coding: utf-8 -*-
# author: Bing
# email: wulitouhaha@vip.qq.com

import sklearn, re
from sklearn.model_selection import train_test_split
from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm
from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn import metrics


# 判断特征维度
def get_evil_char(url):
# 判断恶意字符的出现的个数,做为一个维度
return len(re.findall("[<>,\'\"()/]", url, re.IGNORECASE))

def get_evil_word(url):
# 判断恶意函数的出现的个数,做为一个维度
return len(re.findall("(alert)|(script=)|(eval)|(src=)|(prompt)",url,re.IGNORECASE))

Dimensions = ["A", "N", "Z"] # A-< N-> Z-代表各种html标签;判断这个三个维度
def labels(Data):
#把参数泛化, 并获取词频
preData = [] #泛化后,数据
preCount = [] #泛化后,字母出现的次数;词频矩阵
for i in Data:
tt = i.replace("<", "A")
tt = tt.replace(">", "N")
tt = tt.replace("script", "Z")
preData.append(tt)
preCount.append([tt.count("A"), tt.count("N"), tt.count("Z")])
return preCount, preData

# 预测结果百分比
def do_metrics(y_test,y_pred):
print( "metrics.accuracy_score:" )
print( metrics.accuracy_score(y_test, y_pred) )
print( "metrics.confusion_matrix:" )
print( metrics.confusion_matrix(y_test, y_pred) )
print( "metrics.precision_score:" )
print( metrics.precision_score(y_test, y_pred) )
print( "metrics.recall_score:" )
print( metrics.recall_score(y_test, y_pred) )
print( "metrics.f1_score:" )
print( metrics.f1_score(y_test,y_pred) )

# 测试数据
badCase = [
"""<script></script>""",
""""><img src=# onerror=alert(/1/)>""",
"""<script>alert(11)</script>""",
]

goodCase = [
"""?te=2oildfml&test=sdfhk""",
""""te=292hsd%2342&test=sdf23hi9ehk==""",
"""te=2372987893&test=好的首肯定会""",
]

xss_list, xss_data = labels(badCase) # xss_list 为xss通过词频泛化出的一个词频矩阵; xss_data为泛化后的数据
safe_list, safe_data = labels(goodCase)
x = xss_list + safe_list

# 打上结果标记
safe_lable = [0 for i in range(0,len(goodCase))]
xss_lable = [1 for i in range(0,len(badCase))] # 黑名单数据打上1的标记
y = xss_lable + safe_lable


# 训练模型
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y, test_size=0.4, random_state=0)
clf = svm.SVC(kernel='linear', C=1).fit(x_train, y_train)
y_Predict = clf.predict(x_test)

# --- SVM验证 ---
test = [
'AND 1=1',
'ORDER BY 1-- ',
'<script>alert(xss)</script>/',
'and (select substring(@@version,1,1))=\'X\'',
'www.baidu.com',
'<?php @eval($_POST[\'c\']);?>'
]
# 数据向量化
X_predict, X_data = labels(test)
# 模型预测
y_Predict = clf.predict(X_predict)
mapvalues = {1:'Bad ',0:'Good'}
for i in range(len(X_predict)):
print( mapvalues[y_Predict[i]] + ':' + test[i] )

# 保存训练模型
# joblib.dump(clf, "xss-svm-200000-module.m")
# 加载模型验证
# clf = joblib.load("xss-svm-200000-module.m")
# y_test = []
# y_test = clf.predict(x)
# print( metrics.accuracy_score(y_test,y) )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# -*- coding: utf-8 -*-
# author: Bing
# email: wulitouhaha@vip.qq.com

import sklearn, re
from sklearn.model_selection import train_test_split
from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm
from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn.externals import joblib
from sklearn import metrics
from sklearn import preprocessing

x = []
y = []

# 判断特征维度
def get_evil_char(url):
# 判断符号的出现的个数,做为一个维度
return len(re.findall("[<>,\'\"/]", url, re.IGNORECASE))

def get_evil_word(url):
# 判断关键词的出现的个数,做为一个维度
return len(re.findall("(alert)|(script=)|(eval)|(src=)|(prompt)|(onerror)|(onload)|(onfocus)|(onmouseover)|(string.fromcharcode)|(document.cookie)|(%3c)|(%3e)|(%20)|(iframe)|(href)|(javascript)|(data)",url,re.IGNORECASE))

def get_feature(url):
return [ get_evil_char(url), get_evil_word(url) ]

def labels(filename,data,label):
with open(filename, "rb") as f:
for line in f:
data.append(get_feature(line.decode().strip()))
if label:
y.append(1)
else:
y.append(0)
return data

# 测试数据
labels('xss-200000.txt',x,1)
labels('good-xss-200000.txt',x,0)

# 训练模型
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y, test_size=0.4, random_state=0)
clf = svm.SVC(kernel='linear', C=1).fit(x_train, y_train)
y_Predict = clf.predict(x_test)

# --- SVM验证 ---
test = [
'AND 1=1',
'ORDER BY 1-- ',
'<script>alert(xss)</script>/',
'and (select substring(@@version,1,1))=\'X\'',
'www.baidu.com',
'<?php @eval($_POST[\'c\']);?>'
]
# 数据向量化
z_test = []
for i in test:
z_test.append( get_feature( i ) )

# 模型预测
z_pred = clf.predict(z_test)
mapvalues = {1:'Bad ',0:'Good'}
for i in range(len(z_test)):
print( mapvalues[z_pred[i]] + ':' + test[i] )

# 保存训练模型
# joblib.dump(clf, "xss-svm-200000-module.m")
# 加载模型验证
# clf = joblib.load("xss-svm-200000-module.m")
# y_test = []
# y_test = clf.predict(x)
# print( metrics.accuracy_score(y_test,y) )

0x02 tf-idf模型训练

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
# author: Bing
# email: wulitouhaha@vip.qq.com

import numpy as np
import urllib

badCase = [
"""<script></script>""",
""""><img src=# onerror=alert(/1/)>""",
"""<script>alert(11)</script>""",
]

goodCase = [
"""?te=2oildfml&test=sdfhk""",
""""te=292hsd%2342&test=sdf23hi9ehk==""",
"""te=2372987893&test=好的首肯定会""",
]

# 数据预处理
def getQueryFromFile(filename='badqueries.txt'):
directory = "C:\\fwaf"
filepath = directory + "\\" + filename
data = open(filepath,'r').readlines()
data = list(set(data))
queries = set()
for d in data:
d = d.strip()
try:
d = str(urllib.unquote(d).decode('utf8'))   #converting url encoded data to simple string
queries.add(d)
except:
print('decode ' + d + ' error')
return list(queries)

badQueries = getQueryFromFile('badqueries.txt')
tempvalidQueries = getQueryFromFile('goodqueries.txt')
tempAllQueries = badQueries + tempvalidQueries

ybad = np.ones(len(badQueries))
ygood = np.zeros(len(tempvalidQueries))
y = np.hstack((ybad, ygood))

queries = tempAllQueries

# 构造3-gram特征,使用TF-IDF提取URL文本特征,并进行文本向量化
# tokenizer function, this will make 3 grams of each query
# 构造3-gram特征
def getNGrams(query):
tempQuery = str(query)
ngrams = []
for i in range(0,len(tempQuery)-3):
ngrams.append(tempQuery[i:i+3])
return ngrams


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split

# converting data to vectors
vectorizer = TfidfVectorizer(tokenizer=getNGrams)
# TF-IDF
X = vectorizer.fit_transform(queries)


#splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.linear_model import LogisticRegression
lgs = LogisticRegression()
lgs.fit(X_train, y_train) #training our model
print(lgs.score(X_test, y_test))  #checking the accuracy


X_predict = [
'AND 1=1',
'ORDER BY 1-- ',
'<script>alert(xss)</script>/',
'and (select substring(@@version,1,1))=\'X\'',
'www.baidu.com',
'<?php @eval($_POST[\'c\']);?>'
]

X_vecpredict = vectorizer.transform(X_predict)
y_Predict = lgs.predict(X_vecpredict)

#printing predicted values
mapvalues={1:'Bad ',0:'Good'}
for i in range(len(X_predict)):
print mapvalues[y_Predict[i]]+':'+X_predict[i]

0x03 模型验证

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# -*- coding: utf-8 -*-
# author: Bing
# email: wulitouhaha@vip.qq.com

import re
from sklearn.externals import joblib

# 判断特征维度
def get_evil_char(url):
# 判断恶意字符的出现的个数,做为一个维度
return len(re.findall("[<>,\'\"()/]", url, re.IGNORECASE))

def get_evil_word(url):
# 判断恶意函数的出现的个数,做为一个维度
return len(re.findall("(alert)|(script=)|(eval)|(src=)|(prompt)|(onerror)|(onload)|(onfocus)|(onmouseover)|(string.fromcharcode)|(document.cookie)|(%3c)|(%3e)|(%20)|(iframe)|(href)|(javascript)|(data)",url,re.IGNORECASE))

def get_feature(url):
return [ get_evil_char(url), get_evil_word(url) ]

# 机器验证
test = [
'AND 1=1',
'ORDER BY 1-- ',
'<script>alert(xss)</script>/',
'and (select substring(@@version,1,1))=\'X\'',
'www.baidu.com',
'<?php @eval($_POST[\'c\']);?>'
]
# 数据向量化
z_test = []
for i in test:
z_test.append( get_feature( i ) )

# 模型预测
clf = joblib.load("xss-svm-200000-module.m")
z_pred = clf.predict(z_test)
mapvalues = {1:'Bad ',0:'Good'}
for i in range(len(z_test)):
print( mapvalues[z_pred[i]] + ':' + test[i] )

资源

https://github.com/0xbing/research/tree/master/ai

坚持原创技术分享,您的支持将鼓励我继续创作!