]>
Commit | Line | Data |
---|---|---|
162127f2 PMD |
1 | # ... |
2 | # | |
3 | # Copyright (c) 2019 Philippe Mathieu-Daudé <f4bug@amsat.org> | |
4 | # | |
5 | # This work is licensed under the terms of the GNU GPL, version 2 or | |
6 | # later. See the COPYING file in the top-level directory. | |
7 | ||
8 | import re | |
ca822449 | 9 | import logging |
162127f2 | 10 | |
ca822449 | 11 | from avocado.utils import process |
162127f2 PMD |
12 | from avocado.utils.path import find_command, CmdNotFoundError |
13 | ||
def tesseract_available(expected_version):
    """Check whether tesseract with the expected major version is installed.

    :param expected_version: major version number to look for (e.g. 3 or 4)
    :returns: True if the ``tesseract`` command is found and the major
              version in its ``--version`` banner equals
              ``expected_version``; False if the command is missing, the
              banner cannot be parsed, or the version differs.
    """
    try:
        find_command('tesseract')
    except CmdNotFoundError:
        return False
    res = process.run('tesseract --version')
    # The banner may land on either stream (presumably depending on the
    # tesseract release), so look at stdout first and then at stderr.
    for output in (res.stdout_text, res.stderr_text):
        # Accept an optional 'v' prefix and multi-digit major versions.
        match = re.search(r'\btesseract\s+v?(\d+)', output)
        if match is not None:
            # the captured group is guaranteed to be made of digits
            return int(match.group(1)) == expected_version
    # No recognizable version banner: report tesseract as unavailable
    # rather than raising on unexpected output.
    return False
ca822449 PMD |
31 | |
32 | ||
def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
    """Run tesseract on an image and return the recognized text lines.

    :param image_path: path of the image to run OCR on
    :param tesseract_args: extra command line arguments, given as a single
                           string, placed before the image path
    :param tesseract_version: major tesseract version in use; for version 4
                              the ``--oem 1`` option is appended to the
                              arguments
    :returns: list of the non-empty output lines, stripped of surrounding
              whitespace
    """
    import shlex

    console_logger = logging.getLogger('tesseract')
    console_logger.debug(image_path)
    if tesseract_version == 4:
        tesseract_args += ' --oem 1'
    # Quote the path so that whitespace or shell metacharacters in it do not
    # split it into several arguments when the command string is tokenized.
    # tesseract_args is left as-is because it intentionally carries several
    # arguments.
    proc = process.run("tesseract {} {} stdout".format(
        tesseract_args, shlex.quote(str(image_path))))
    lines = []
    for line in proc.stdout_text.splitlines():
        sline = line.strip()
        if sline:
            console_logger.debug(sline)
            lines.append(sline)
    return lines