Skip to content

Commit 809d4ad

Browse files
committed
完善
1 parent b7764b9 commit 809d4ad

File tree

4 files changed

+126
-2
lines changed

4 files changed

+126
-2
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,9 @@ dmypy.json
127127

128128
# Pyre type checker
129129
.pyre/
130+
131+
# VSCode
132+
.vscode/
133+
134+
# 下载的图片
135+
image/

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
1-
# identification-codes-py
2-
由 Python 编写的超级简单的不怎么靠谱的验证码识别系统
1+
# Python 验证码识别
2+
3+
> 由 Python 编写的超级简单的不怎么靠谱的验证码识别系统
4+
5+
- 使用 PIL 处理验证码图片
6+
- 使用 OCR 方式识别验证码

program.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from PIL import Image
2+
import pytesseract, os
3+
from urllib.request import urlopen, Request
4+
5+
# Default HTTP request headers. GetCode uses this dict as the default value of
# its ``headers`` argument, so changes made through an instance that kept the
# default are visible here as well.
headers = {}
headers['Referer'] = 'http://chaxun.heyuanedu.cn:88/'
headers['Accept'] = 'text/html,image/webp,image/png,image/jpeg,*/*;q=0.8'
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 Edg/84.0.522.63'
10+
11+
class GetCode:
    """Download a captcha image over HTTP and read it with Tesseract OCR.

    Every method accepts optional overrides for the instance settings; an
    override that is not ``None`` is stored on the instance before the method
    does its work, so it also applies to later calls.
    """

    def __init__(self, path, url=None, headers=headers, origin='chaxun.heyuanedu.cn:88', threshold=185):
        """Store the settings shared by all later calls.

        Args:
            path: File path the captcha image is written to and read from.
            url: Address the captcha image is requested from.
            headers: Dict of HTTP headers sent with each request. It is kept
                by reference (not copied), so the cookie that ``getSession``
                adds is also visible through the module-level ``headers``
                dict when the default is used.
            origin: Passed to ``Request`` as ``origin_req_host``.
            threshold: Per-channel cut-off used by ``identify``; a pixel
                becomes black only if all of its channels are below it.
        """
        self.path = path
        self.url = url
        self.headers = headers
        self.origin = origin
        self.threshold = threshold

    def getReq(self, url=None, headers=None, origin=None):
        """Build the ``urllib.request.Request`` for the current settings.

        ``origin`` defaults to ``None`` so that calling ``getReq()`` without
        arguments keeps the origin configured on the instance instead of
        resetting it.

        Returns:
            A ``Request`` for ``self.url`` with no body, carrying
            ``self.headers`` and ``self.origin``.
        """
        self.setting(url=url, headers=headers, origin=origin)
        return Request(self.url, data=None, headers=self.headers, origin_req_host=self.origin)

    def setting(self, path=None, url=None, headers=None, origin=None, threshold=None):
        """Update instance settings; arguments left as ``None`` are unchanged."""
        if path is not None:
            self.path = path
        if url is not None:
            self.url = url
        if headers is not None:
            self.headers = headers
        if origin is not None:
            self.origin = origin
        if threshold is not None:
            self.threshold = threshold

    def getSession(self, url=None, headers=None, origin=None):
        """Request ``self.url`` once and keep the session cookie it returns.

        The first ``name=value`` pair of the response's ``Set-Cookie`` header
        is stored under ``'Cookie'`` in ``self.headers`` (mutated in place).

        Warning (from the original author): do not call this repeatedly, it
        may lead to unpredictable problems.

        Returns:
            The updated header dict.

        Raises:
            RuntimeError: If the response carries no ``Set-Cookie`` header.
        """
        self.setting(url=url, headers=headers, origin=origin)
        with urlopen(self.getReq()) as response:
            cookie = response.info()['Set-Cookie']
        if cookie is None:
            raise RuntimeError('server response did not include a Set-Cookie header')
        self.headers['Cookie'] = cookie.split(';')[0]
        return self.headers

    def download(self, url=None, path=None, headers=None, origin=None):
        """Download the captcha image and write it to ``self.path``.

        Returns:
            The number of bytes written.
        """
        self.setting(path=path, url=url, headers=headers, origin=origin)
        # Fetch first so a failed request does not leave a truncated file.
        with urlopen(self.getReq()) as response:
            data = response.read()
        # A bare file name has no directory part; only create one if present.
        directory = os.path.dirname(self.path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(self.path, 'wb') as f:
            return f.write(data)

    def identify(self, path=None, threshold=None):
        """Clean up the saved captcha image and run OCR on it.

        The image is binarised with ``self.threshold``, isolated dark pixels
        are whitened, the result is saved as ``code-Pretreatment.png`` next
        to the source image, and the text Tesseract reads from it is returned.

        Returns:
            The string produced by ``pytesseract.image_to_string``.
        """
        self.setting(path=path, threshold=threshold)
        # Normalise to RGB so every pixel is a 3-tuple whatever the file's
        # mode (palette and greyscale images yield ints otherwise).
        with Image.open(self.path) as source:
            image = source.convert('RGB')
        pixdata = image.load()
        w, h = image.size
        black, white = (0, 0, 0), (255, 255, 255)

        # Pass 1 - strip colour: dark in every channel -> black, else white.
        for y in range(h):
            for x in range(w):
                dark = all(channel < self.threshold for channel in pixdata[x, y])
                pixdata[x, y] = black if dark else white

        # Pass 2 - remove interference lines: whiten any interior pixel that
        # has more than two white 4-neighbours. Works in place, so pixels
        # already whitened influence the ones visited after them.
        for y in range(1, h - 1):
            for x in range(1, w - 1):
                neighbours = ((x, y - 1), (x, y + 1), (x - 1, y), (x + 1, y))
                white_count = sum(pixdata[pos][0] > 245 for pos in neighbours)
                if white_count > 2:
                    pixdata[x, y] = white

        # Keep the pre-processed image for inspection (not required; the
        # original author suggests removing this in production).
        image.save(os.path.join(os.path.dirname(self.path), 'code-Pretreatment.png'))
        return pytesseract.image_to_string(image, 'eng')

test.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Demo script: fetch a captcha, OCR it, and submit a query with the recognised
# code, retrying until the response page contains a `#printGrade` element.
print('preparing...')  # printed first, before the imports below are executed

from program import GetCode, headers
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from urllib.parse import urlencode
import os
import re

ele0 = ''
# Captcha pattern for this site: four digits, each pair optionally separated
# by non-digit characters.
p = re.compile(r'(\d)[^0-9]*(\d)[^0-9]*(\d)[^0-9]*(\d)')

getCode = GetCode('image%scode.png' % os.sep, 'http://chaxun.heyuanedu.cn:88/validatecode.php?act=getimg')
# Obtain the session cookie; it is written into the `headers` dict imported
# above, which the query request further down sends as well.
getCode.getSession()

userName = input('姓名:')
identity = input('准考证号:')

# Outer loop: the code matched the pattern but the query returned no result.
# NOTE: neither loop has a retry limit or a delay between attempts.
while ele0 == '':
    result2 = None
    # Inner loop: the OCR output did not match the captcha pattern.
    while result2 is None:
        print('Downloading...')
        getCode.download()
        print('Identifying...')
        result = getCode.identify()
        match = p.search(result)  # search once and reuse the match
        result2 = '%s%s%s%s' % match.groups() if match is not None else None
        # What to show for this attempt: the digits, the raw OCR text, or a
        # failure marker when OCR returned nothing.
        result0 = (result2 if result2 is not None else result) if result != '' else 'failure!'
        print('Result: ' + result0)

    req = Request('http://chaxun.heyuanedu.cn:88/search.php', urlencode({ 'userName': userName, 'identity': identity, 'code': result0 }).encode(), headers, 'chaxun.heyuanedu.cn:88')
    # Close the HTTP response as soon as the body has been read.
    with urlopen(req) as res:
        html = res.read().decode()
    soup = BeautifulSoup(html, 'lxml')  # parse the returned HTML
    ele = soup.select('#printGrade')
    # str() of the result list minus its brackets; '' when nothing matched.
    ele0 = str(ele)[1:-1]
    print(ele0 if ele0 != '' else 'Error!')

0 commit comments

Comments
 (0)