-
Notifications
You must be signed in to change notification settings - Fork 74
Expand file tree
/
Copy pathocr.py
More file actions
96 lines (75 loc) · 3.91 KB
/
ocr.py
File metadata and controls
96 lines (75 loc) · 3.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pytesseract
import unicodedata
import re
import numpy as np
# from PIL import Image, ImageFont, ImageDraw, ImageEnhance
# Define class variables
bounding_box_order = ["left", "top", "right", "bottom"]
# This method will take the model bounding box predictions and return the extracted text inside each box
def one_shot_ocr_service(image, output):
    """Run OCR inside each predicted bounding box.

    Args:
        image: PIL image to crop regions from.
        output: dict containing a 'bounding-boxes' list; each entry holds a
            'coordinates' dict with 'left'/'top'/'right'/'bottom' keys.

    Returns:
        A list of dicts, one per box that produced non-empty text, each with
        'text' (cleaned OCR text), 'box' ([left, top, right, bottom]) and
        'score' (mean tesseract confidence scaled to 0-1).
    """
    response = []
    detections = output['bounding-boxes']
    for detection in detections:
        # crop image for every detection
        coordinates = detection["coordinates"]
        cropped = image.crop((float(coordinates["left"]), float(
            coordinates["top"]), float(coordinates["right"]), float(coordinates["bottom"])))
        # convert image to grayscale for better accuracy
        processed_img = cropped.convert('L')
        # extract text with positive confidence from cropped image
        df = pytesseract.image_to_data(processed_img, output_type='data.frame')
        valid_df = df[df["conf"] > 0]
        extracted_text = " ".join(valid_df["text"].values)
        # normalize to ASCII and strip common OCR artifacts (ellipses, smart
        # quotes, the spurious "alt/1m" token)
        extracted_text = str(
            unicodedata.normalize('NFKD', extracted_text).encode('ascii', 'ignore').decode()).strip().replace("\n",
            " ").replace("...", ".").replace("..", ".").replace('”', ' ').replace('“', ' ').replace("'", ' ').replace('\"',
            '').replace("alt/1m", "").strip()
        # raw string so the \[ \] \- escapes are taken literally by re, not
        # mangled by the Python string parser (was a non-raw string: bug)
        extracted_text = re.sub(
            r'[^A-Za-z0-9.!?,;%:=()\[\]$€&/\- ]+', '', extracted_text)
        extracted_text = " ".join(extracted_text.split())
        # wrap each non-empty prediction inside a dictionary
        # (was `len(extracted_text) is not 0` — identity comparison on an int
        # literal is implementation-dependent and a SyntaxWarning)
        if extracted_text:
            prediction = {'text': extracted_text}
            prediction["box"] = [coordinates[el] for el in bounding_box_order]
            prediction["score"] = valid_df["conf"].mean() / 100.0
            response.append(prediction)
    return response
# This method will take an image and return the extracted text from the image
def ocr_service(image):
    """Run OCR over the whole image.

    Args:
        image: PIL image.

    Returns:
        A single-element list containing a dict with 'text' (cleaned OCR
        text), 'box' ([left, top, right, bottom] spanning the first and last
        detected words) and 'score' (mean confidence scaled to 0-1).
        Returns an empty list when no text with positive confidence is found.
    """
    # convert image to grayscale for better accuracy
    processed_img = image.convert('L')
    # Get data including boxes, confidences, line and page numbers
    df = pytesseract.image_to_data(processed_img, output_type='data.frame')
    valid_df = df[df["conf"] > 0]
    # guard: with no confident detections the index lookups below would raise
    # IndexError; return an empty result instead (mirrors one_shot_ocr_service)
    if valid_df.empty:
        return []
    # process text: ASCII-normalize and strip common OCR artifacts
    extracted_text = " ".join(valid_df["text"].values)
    extracted_text = str(
        unicodedata.normalize('NFKD', extracted_text).encode('ascii', 'ignore').decode()).strip().replace("\n",
        " ").replace("...", ".").replace("..", ".").replace('”', ' ').replace('“', ' ').replace("'", ' ').replace('\"', '').replace(
        "alt/1m", "").strip()
    # raw string so the \[ \] \- escapes are taken literally by re, not
    # mangled by the Python string parser (was a non-raw string: bug)
    extracted_text = re.sub(
        r'[^A-Za-z0-9.!?,;%:=()\[\]$€&/\- ]+', '', extracted_text)
    extracted_text = " ".join(extracted_text.split())
    # calculate the bounding box data based on pytesseract results:
    # top-left from the first detected word, bottom-right from the last
    coordinates = {}
    index = valid_df.index.values
    coordinates["left"] = valid_df.loc[index[0], "left"]
    coordinates["top"] = valid_df.loc[index[0], "top"]
    coordinates["bottom"] = valid_df.loc[index[-1],
                                         "top"] + valid_df.loc[index[-1], "height"]
    coordinates["right"] = valid_df.loc[index[-1],
                                        "left"] + valid_df.loc[index[-1], "width"]
    # .item() converts numpy scalars to plain Python numbers
    bounding_box = [coordinates[el].item() for el in bounding_box_order]
    # wrap the prediction inside a dictionary
    response = {
        'text': extracted_text,
        'box': bounding_box,
        'score': valid_df["conf"].mean() / 100.0,
    }
    return [response]