forked from DEFI-COLaF/TheatreLFSV2
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrtk-script.py
More file actions
71 lines (61 loc) · 2.11 KB
/
rtk-script.py
File metadata and controls
71 lines (61 loc) · 2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
""" This is a sample script for using RTK (Release the krakens)
It takes a file with a list of manifests to download from IIIF (see manifests.txt) and passes it through a suite of commands:
0. It downloads manifests and transforms them into CSV files
1. It downloads images from the manifests
2. It applies YALTAi segmentation with line segmentation
3. It fixes up the image PATH of XML files
4. It processes the text as well through Kraken
5. It removes the image files (from the one hundred objects that were meant to be processed as a group)
The batch file should be lower if you want to keep the space used low, specifically if you use DownloadIIIFManifest.
"""
from rtk.task import KrakenAltoCleanUpCommand, YALTAiCommand, KrakenRecognizerCommand, ExtractZoneAltoCommand
from rtk import utils
import glob
from sys import argv
# Every path matched here is treated as a folder holding *.jpg page images.
# glob returns matches in arbitrary order, so the list is sorted to keep the
# batch composition reproducible from one run to the next.
folders = sorted(glob.glob("/home/thibault/dev/colaf-theatre/todo/books/*"))

# Number of folders whose images are pushed through the pipeline together.
FOLDERS_PER_BATCH = 4

# Worker count for the YALTAi step: optional single CLI argument, default 5.
if len(argv) == 2:
    num_workers = int(argv[1])
else:
    num_workers = 5

for i in range(0, len(folders), FOLDERS_PER_BATCH):
    # Flatten the images of the current group of folders into one list.
    # glob.escape keeps folder names containing glob metacharacters
    # ("[", "]", "*", "?") from being interpreted as part of the pattern.
    batch = [
        file
        for folder in folders[i:i + FOLDERS_PER_BATCH]
        for file in sorted(glob.glob(f"{glob.escape(folder)}/*.jpg"))
    ]
    if not batch:
        # None of these folders contains a .jpg: nothing to run the tools on.
        continue

    # Apply YALTAi
    print("[Task] Segment")
    yaltai = YALTAiCommand(
        batch,
        binary="yaltaienv/bin/yaltai",
        device="cuda:0",
        yoloV5_model="LADaS.pt",
        verbose=True,
        raise_on_error=True,
        allow_failure=False,
        multiprocess=num_workers,  # GPU Memory // 5gb
        check_content=False
    )
    yaltai.process()

    # Clean-up the relative filepath of Kraken Serialization
    print("[Task] Clean-Up Serialization")
    cleanup = KrakenAltoCleanUpCommand(yaltai.output_files)
    cleanup.process()

    # Apply Kraken
    # NOTE(review): the recognizer is fed yaltai.output_files, not an output of
    # the clean-up command -- presumably the clean-up rewrites those files in
    # place; confirm against rtk.
    print("[Task] OCR")
    kraken = KrakenRecognizerCommand(
        yaltai.output_files,
        binary="krakenv/bin/kraken",
        device="cuda",
        model="long-s.mlmodel",
        multiprocess=14,  # GPU Memory // 3gb
        check_content=True  # Required ?
    )
    kraken.process()

    # Run the zone extraction on the recognizer's output files (fmt="tei",
    # no zone filter given).
    print("[Task] Extract")
    task = ExtractZoneAltoCommand(
        kraken.output_files,
        zones=None,
        fmt="tei"
    )
    task.process()