问题:彩色的验证码,使用pytesseract识别出来的验证码内容一直是空字符串
原因:pytesseract基本上只能识别浅色背景上的深色(黑色)文字,彩色的验证码直接识别不出内容
解决办法:先把彩色图片精确转换成黑白(二值)图片,再将黑白图片进行反相,使验证码部分的内容变成黑色、背景变成白色。
代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2019-11-204 T11:18:38.406Z
# @Author : HuangChang
import numpy as np
import cv2
from PIL import Image
import pytesseract
# 1. Convert the colour captcha into a black-and-white (binary) image.
# (1) Read the image; OpenCV loads it in BGR channel order.
img = cv2.imread("../screenshots/verification_code.png")
# cv2.imread returns None instead of raising when the file is missing or
# unreadable, so fail here with a clear message rather than inside cvtColor.
if img is None:
    raise IOError("Cannot read image: ../screenshots/verification_code.png")
# (2) Convert to HSV space and split the channels; only S (saturation) is
# used below. Presumably the coloured glyphs are more saturated than the
# background -- verify against real captcha samples.
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
h, s, v = cv2.split(hsv)
# (3) Binarize the S channel with Otsu's method. Otsu derives one global
# threshold from the histogram, so the `thresh` argument is ignored (0 by
# convention); the threshold actually used is returned in `th`.
th, threshed = cv2.threshold(s, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
# (4) Print the computed threshold and save the result.
print("Thresh : {}".format(th))
# cv2.imwrite reports failure through its return value; check it so the next
# step never silently reads a stale file.
if not cv2.imwrite("../screenshots/verification_code2.png", threshed):
    raise IOError("Cannot write image: ../screenshots/verification_code2.png")
# 2. Invert the black-and-white image produced by the previous step so that
# the content to recognize becomes black on a white background.
img2 = cv2.imread("../screenshots/verification_code2.png")
# cv2.imread returns None (no exception) when the file cannot be read.
if img2 is None:
    raise IOError("Cannot read image: ../screenshots/verification_code2.png")
# With its default flag imread yields a 3-channel image, hence the 3-tuple.
height, width, channels = img2.shape
print("width:%s,height:%s,channels:%s" % (width, height, channels))
# For 8-bit pixels a bitwise NOT equals 255 - value, so one call replaces the
# per-pixel, per-channel Python loop.
img2 = cv2.bitwise_not(img2)
# NOTE(review): no cv2.waitKey follows, so this preview window may never be
# painted before the script ends -- confirm whether it is still wanted.
cv2.imshow("AfterDeal", img2)
if not cv2.imwrite("../screenshots/verification_code3.png", img2):
    raise IOError("Cannot write image: ../screenshots/verification_code3.png")
# 3. Run OCR on the inverted image and print the recognized captcha text.
captcha_path = "../screenshots/verification_code3.png"
img3 = cv2.imread(captcha_path)
# "--psm 8" asks Tesseract to treat the whole image as a single word.
code_str = pytesseract.image_to_string(img3, lang="eng", config="--psm 8")
# Splitting on whitespace and re-joining strips every space and newline that
# the OCR engine leaves in its output.
tokens = code_str.split()
print("".join(tokens))