### Tags
#AI/OCR #Python #OCR
---
### Citation (if applicable)
----
### Purpose of Code
The code was developed with Google AI Studio and Gemini 2.5 Pro to perform OCR on 16th- to 19th-century manuscripts obtained from the Internet Archive. It submits a PDF stored in Google Cloud Storage to the Cloud Vision API, which writes the extracted text to JSON files in increments of 100 pages. This is the first attempt and the fourth iteration of the code.
____
### Code
```python
from google.cloud import vision
import concurrent.futures  # operation.result() raises concurrent.futures.TimeoutError on timeout
# The GCS listing step below also needs the storage library.
# In Terminal, run: pip3 install google-cloud-storage
def async_detect_document_text_gcs(gcs_source_uri, gcs_destination_uri, language_code="la", timeout_seconds=600):
"""Performs asynchronous OCR on a document stored in GCS.
Args:
gcs_source_uri: The GCS URI of the input PDF or TIFF file (e.g., "gs://bucket/file.pdf").
gcs_destination_uri: The GCS URI where the JSON output will be stored
(e.g., "gs://bucket/ocr_results/"). Ensure this folder exists or
the service account has permissions to create it.
Output files will be named like: output-1-to-N.json
language_code: Language hint for OCR.
timeout_seconds: Seconds to wait for the operation to complete.
"""
client = vision.ImageAnnotatorClient()
# Determine mime_type from file extension
if gcs_source_uri.lower().endswith(".pdf"):
mime_type = 'application/pdf'
elif gcs_source_uri.lower().endswith((".tiff", ".tif")):
mime_type = 'image/tiff'
else:
raise ValueError("Unsupported file type. Only PDF and TIFF are supported for GCS async.")
gcs_source = vision.GcsSource(uri=gcs_source_uri)
    # batch_size controls how many pages from the input document are grouped into
    # each JSON output shard. The Vision API writes one or more JSON files to the
    # output GCS location; each file holds results for up to batch_size pages.
    output_config_batch_size = 100  # Pages per output JSON shard
feature = vision.Feature(
type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION,
)
image_context = vision.ImageContext(language_hints=[language_code])
input_config = vision.InputConfig(
gcs_source=gcs_source, mime_type=mime_type)
output_config = vision.OutputConfig(
gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
batch_size=output_config_batch_size # Pages per output JSON shard
)
async_request = vision.AsyncAnnotateFileRequest(
features=[feature],
input_config=input_config,
        output_config=output_config,
image_context=image_context
)
print(f"Submitting OCR request for {gcs_source_uri} with language {language_code}")
# For async_batch_annotate_files, the argument is 'requests' (plural)
operation = client.async_batch_annotate_files(
requests=[async_request]) # The request is a list containing one item
try:
print(f"Waiting for operation '{operation.operation.name}' to complete (up to {timeout_seconds}s)...")
operation.result(timeout=timeout_seconds) # This blocks and will raise on LRO failure or timeout
print(f'OCR processing operation itself completed. Checking for output files at {gcs_destination_uri}.')
# If result() doesn't raise, the LRO is done. The actual success of OCR is confirmed by output files.
    except concurrent.futures.TimeoutError:
print(f"Operation timed out after {timeout_seconds} seconds. Output might not be generated.")
return # Stop if timed out
    except Exception as e:
        print(f"Operation failed or encountered an error: {e}")
        print("Please check the Google Cloud Console for operation status and logs.")
        return  # Stop if there was an LRO error
# If we reach here, the LRO completed without raising an immediate exception.
# Now, proceed to list output files from GCS. The presence of these files is the true indicator
# of successful OCR processing for batch operations.
print(f"\nAttempting to list output files in {gcs_destination_uri}...")
# ... (rest of your GCS file listing code) ...
# The results are not directly returned by the operation.
# They are written to the GCS destination URI.
# You'll need to download and parse these JSON files from GCS.
print(f"\nAttempting to list output files in {gcs_destination_uri}...")
try:
from google.cloud import storage
storage_client = storage.Client()
# Parse GCS URI
# gs://bucket-name/path/to/output/
# The prefix should be 'path/to/output/'
if not gcs_destination_uri.startswith("gs://"):
raise ValueError("GCS URI must start with gs://")
path_parts = gcs_destination_uri[5:].split("/", 1)
bucket_name = path_parts[0]
prefix = ""
if len(path_parts) > 1:
prefix = path_parts[1]
if not prefix.endswith('/'): # ensure prefix ends with a slash if it's a folder
prefix += '/'
bucket = storage_client.bucket(bucket_name)
print(f"--- Contents of OCR output at gs://{bucket_name}/{prefix} ---")
output_files_found = False
for blob in bucket.list_blobs(prefix=prefix):
# The actual output files might have a specific prefix like "output-"
# and a suffix like ".json"
if blob.name.endswith('.json'): # Process only JSON output files
output_files_found = True
print(f"Found result file: gs://{bucket_name}/{blob.name}")
                # To process the content (see the standalone example after this code block):
                #   data = json.loads(blob.download_as_bytes())
                #   for response_item in data.get('responses', []):
                #       text = response_item.get('fullTextAnnotation', {}).get('text', '')
                #       print(f"  Text from {blob.name}: {text[:100]}...")  # first 100 chars
if not output_files_found:
print(f"No JSON output files found in gs://{bucket_name}/{prefix}. \n"
f"This might be okay if the operation just finished; files can take a moment to appear. \n"
f"Otherwise, check operation status in Google Cloud Console and permissions.")
except ImportError:
print("Python library 'google-cloud-storage' not found. Please install it to list GCS output files: \n"
"In Terminal, run: pip3 install google-cloud-storage")
except Exception as e:
print(f"Error listing/processing GCS output: {e}")
import traceback
traceback.print_exc()
if __name__ == '__main__':
# === MODIFY THESE LINES BELOW ===
# 1. Replace with YOUR Google Cloud Storage bucket name and path to YOUR PDF file
gcs_input_uri = 'gs://coin_ocr_bucket1/input_manuscripts/1743 Numismata Imperatorum Romanorum.pdf'
# 2. Replace with YOUR Google Cloud Storage bucket name and desired output folder path
# Ensure this output folder exists in your bucket, or that the service account
# has permissions to create it. IT MUST END WITH A SLASH (/).
gcs_output_uri = 'gs://coin_ocr_bucket1/Output_Results/'
# 3. Set the language code (e.g., "la" for Latin)
manuscript_language = "la"
# 4. (Optional) Adjust timeout in seconds if your PDFs are very large (e.g., many hundreds of pages)
processing_timeout_seconds = 1800 # 30 minutes
print(f"Starting OCR for: {gcs_input_uri}")
print(f"Output will be saved to: {gcs_output_uri}")
print(f"Language hint: {manuscript_language}")
try:
async_detect_document_text_gcs(gcs_input_uri, gcs_output_uri,
language_code=manuscript_language,
timeout_seconds=processing_timeout_seconds)
except ValueError as ve:
print(f"Configuration Error: {ve}")
except Exception as e:
print(f"An unexpected error occurred during GCS processing: {e}")
import traceback
traceback.print_exc()
```
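Below is a minimal, standalone sketch of how the JSON shards written by the Vision API might be read back into page text. The bucket name, prefix, and the shard-name-based sorting are assumptions drawn from the configuration above and the `output-1-to-N.json` naming noted in the docstring; adjust them to your setup.
```python
# Sketch: read the OCR result shards from GCS and yield the text of each page.
# Bucket name and prefix are placeholders matching gcs_output_uri above.
import json
from google.cloud import storage

def read_ocr_output(bucket_name="coin_ocr_bucket1", prefix="Output_Results/"):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    shards = [b for b in bucket.list_blobs(prefix=prefix) if b.name.endswith(".json")]
    # Sort numerically by starting page (names look like "output-1-to-100.json")
    # so pages come back in document order.
    shards.sort(key=lambda b: int(b.name.split("output-")[-1].split("-to-")[0]))
    for blob in shards:
        data = json.loads(blob.download_as_bytes())
        for response in data.get("responses", []):
            # Each item in "responses" corresponds to one page of the input PDF.
            yield response.get("fullTextAnnotation", {}).get("text", "")

if __name__ == "__main__":
    for page_number, page_text in enumerate(read_ocr_output(), start=1):
        print(f"--- page {page_number} ---")
        print(page_text[:200])  # preview the first 200 characters
```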
____
### Challenges:
[[OCR, Google Vision, and Google AI Studio#Challenges]] *This link discusses the challenges of developing Python code using Google Gemini in AI Studio*
### Next Steps:
1) Clean the code
2) Explore exporting the OCR output to plain text (.txt) instead of .json files (see the sketch below).
3) [[OCR, Google Vision, and Google AI Studio#Thoughts on where to go next]] *Next steps for OCR and dissertation*
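For step 2, one possible starting point is sketched below. It reuses the hypothetical `read_ocr_output()` helper from the sketch above, and the output filename is a placeholder.
```python
# Sketch for next step 2: concatenate all OCR'd pages into a single plain-text file.
# Relies on the read_ocr_output() helper defined in the earlier sketch.
from pathlib import Path

def export_ocr_to_txt(output_path="1743_Numismata_Imperatorum_Romanorum.txt"):
    pages = read_ocr_output()  # yields one string per page, in document order
    # Separate pages with a form feed so page boundaries stay visible in the .txt file.
    Path(output_path).write_text("\n\f\n".join(pages), encoding="utf-8")
    print(f"Wrote plain-text OCR output to {output_path}")
```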