### Tags
#AI/OCR #Python #OCR

---

### Citation (if applicable)

---

### Purpose of Code
The code was developed with Google AI Studio and Gemini 2.5 Pro to perform OCR on 16th- to 19th-century manuscripts downloaded from the Internet Archive. It extracts the text and writes it to JSON output files in Google Cloud Storage, with each output file holding up to 100 pages. This is the first attempt and the fourth iteration of the code.

---

### Code
```python
from google.cloud import vision

# You might need to install the storage library if you haven't:
# In Terminal, run: pip3 install google-cloud-storage


def async_detect_document_text_gcs(gcs_source_uri, gcs_destination_uri,
                                   language_code="la", timeout_seconds=600):
    """Performs asynchronous OCR on a document stored in GCS.

    Args:
        gcs_source_uri: The GCS URI of the input PDF or TIFF file
            (e.g., "gs://bucket/file.pdf").
        gcs_destination_uri: The GCS URI where the JSON output will be stored
            (e.g., "gs://bucket/ocr_results/"). Ensure this folder exists or the
            service account has permission to create it. Output files will be
            named like: output-1-to-N.json
        language_code: Language hint for OCR.
        timeout_seconds: Seconds to wait for the operation to complete.
    """
    client = vision.ImageAnnotatorClient()

    # Determine mime_type from the file extension.
    if gcs_source_uri.lower().endswith(".pdf"):
        mime_type = 'application/pdf'
    elif gcs_source_uri.lower().endswith((".tiff", ".tif")):
        mime_type = 'image/tiff'
    else:
        raise ValueError("Unsupported file type. Only PDF and TIFF are supported for GCS async OCR.")

    gcs_source = vision.GcsSource(uri=gcs_source_uri)

    # How many pages are grouped into each JSON output file.
    # For DOCUMENT_TEXT_DETECTION on a PDF, the Vision API writes one or more
    # JSON files to the output GCS location; `batch_size` in the output config
    # controls how many pages go into each individual JSON output shard.
    output_config_batch_size = 100  # Number of pages per output JSON file shard

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    image_context = vision.ImageContext(language_hints=[language_code])
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)
    output_config = vision.OutputConfig(
        gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
        batch_size=output_config_batch_size  # Pages per output JSON shard
    )
    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config,
        image_context=image_context
    )

    print(f"Submitting OCR request for {gcs_source_uri} with language {language_code}")
    # For async_batch_annotate_files, the argument is 'requests' (plural),
    # a list containing one request per input file.
    operation = client.async_batch_annotate_files(requests=[async_request])

    try:
        print(f"Waiting for operation '{operation.operation.name}' to complete (up to {timeout_seconds}s)...")
        # This blocks and will raise on long-running-operation (LRO) failure or timeout.
        operation.result(timeout=timeout_seconds)
        print(f"OCR processing operation completed. Checking for output files at {gcs_destination_uri}.")
        # If result() does not raise, the LRO is done. The actual success of the OCR
        # is confirmed by the presence of output files in GCS.
    except TimeoutError:
        print(f"Operation timed out after {timeout_seconds} seconds. Output might not be generated.")
        return  # Stop if timed out
    except Exception as e:
        print(f"Operation failed or encountered an error: {e}")
        # operation.metadata may carry more details about the failure.
        print("Please check the Google Cloud Console for operation status and logs.")
        return  # Stop if there was an LRO error

    # The results are not returned by the operation itself; they are written to
    # the GCS destination URI as JSON files, which must be downloaded and parsed.
    print(f"\nAttempting to list output files in {gcs_destination_uri}...")
    try:
        from google.cloud import storage
        storage_client = storage.Client()

        # Parse the GCS URI: gs://bucket-name/path/to/output/ -> bucket + prefix.
        if not gcs_destination_uri.startswith("gs://"):
            raise ValueError("GCS URI must start with gs://")
        path_parts = gcs_destination_uri[5:].split("/", 1)
        bucket_name = path_parts[0]
        prefix = ""
        if len(path_parts) > 1:
            prefix = path_parts[1]
            if not prefix.endswith('/'):  # Ensure the prefix ends with a slash if it is a folder.
                prefix += '/'

        bucket = storage_client.bucket(bucket_name)

        print(f"--- Contents of OCR output at gs://{bucket_name}/{prefix} ---")
        output_files_found = False
        for blob in bucket.list_blobs(prefix=prefix):
            # The output files are typically prefixed with "output-" and end in ".json".
            if blob.name.endswith('.json'):  # Process only JSON output files.
                output_files_found = True
                print(f"Found result file: gs://{bucket_name}/{blob.name}")
                # To process the content:
                # json_string = blob.download_as_string()
                # data = json.loads(json_string)
                # Each file contains a 'responses' list with one entry per page;
                # each entry carries a 'fullTextAnnotation' with the page text.

        if not output_files_found:
            print(f"No JSON output files found in gs://{bucket_name}/{prefix}.\n"
                  f"This might be okay if the operation just finished; files can take a moment to appear.\n"
                  f"Otherwise, check the operation status in the Google Cloud Console and your permissions.")

    except ImportError:
        print("Python library 'google-cloud-storage' not found. "
              "Please install it to list GCS output files:\n"
              "In Terminal, run: pip3 install google-cloud-storage")
    except Exception as e:
        print(f"Error listing/processing GCS output: {e}")
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    # === MODIFY THESE LINES BELOW ===

    # 1. Replace with YOUR Google Cloud Storage bucket name and path to YOUR PDF file.
    gcs_input_uri = 'gs://coin_ocr_bucket1/input_manuscripts/1743 Numismata Imperatorum Romanorum.pdf'

    # 2. Replace with YOUR Google Cloud Storage bucket name and desired output folder path.
    #    Ensure this output folder exists in your bucket, or that the service account
    #    has permission to create it. IT MUST END WITH A SLASH (/).
    gcs_output_uri = 'gs://coin_ocr_bucket1/Output_Results/'

    # 3. Set the language code (e.g., "la" for Latin).
    manuscript_language = "la"

    # 4. (Optional) Adjust the timeout in seconds if your PDFs are very large
    #    (e.g., many hundreds of pages).
    processing_timeout_seconds = 1800  # 30 minutes

    print(f"Starting OCR for: {gcs_input_uri}")
    print(f"Output will be saved to: {gcs_output_uri}")
    print(f"Language hint: {manuscript_language}")

    # Before running, ensure you have installed the library needed for GCS interaction:
    # In Terminal, run: pip3 install google-cloud-storage

    try:
        async_detect_document_text_gcs(gcs_input_uri, gcs_output_uri,
                                       language_code=manuscript_language,
                                       timeout_seconds=processing_timeout_seconds)
    except ValueError as ve:
        print(f"Configuration Error: {ve}")
    except Exception as e:
        print(f"An unexpected error occurred during GCS processing: {e}")
        import traceback
        traceback.print_exc()
```

---

### Challenges:
[[OCR, Google Vision, and Google AI Studio#Challenges]] *This link discusses the challenges of developing Python code with Google Gemini in AI Studio.*

### Next Steps:
1) Clean the code.
2) Explore exporting the extracted text as plain text (.txt) instead of .json files (see the sketch below).
3) [[OCR, Google Vision, and Google AI Studio#Thoughts on where to go next]] *Next steps for OCR and the dissertation.*
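A possible starting point for step 2 is a small post-processing script, sketched below. It is not part of the generated code above: it downloads the JSON shards that the Vision API wrote to the output folder, pulls each page's `fullTextAnnotation.text`, and concatenates everything into a single `.txt` file. It assumes the default shard naming (`output-1-to-N.json`, as noted in the docstring above) and the standard `responses`/`fullTextAnnotation` structure of the output; the function name `merge_ocr_output_to_txt` and the local output filename are only illustrative.

```python
import json
import re

from google.cloud import storage


def merge_ocr_output_to_txt(gcs_destination_uri, local_txt_path):
    """Downloads Vision API JSON shards from GCS and writes the extracted text to one .txt file."""
    # Split "gs://bucket/prefix/" into bucket name and prefix.
    bucket_name, _, prefix = gcs_destination_uri[len("gs://"):].partition("/")

    storage_client = storage.Client()
    blobs = [b for b in storage_client.list_blobs(bucket_name, prefix=prefix)
             if b.name.endswith(".json")]

    # Shards are named like "output-1-to-100.json"; sort by starting page number
    # so the extracted text stays in page order.
    def start_page(blob):
        match = re.search(r"output-(\d+)-to-(\d+)\.json$", blob.name)
        return int(match.group(1)) if match else 0

    blobs.sort(key=start_page)

    with open(local_txt_path, "w", encoding="utf-8") as out:
        for blob in blobs:
            data = json.loads(blob.download_as_text())
            for response in data.get("responses", []):
                page = response.get("context", {}).get("pageNumber", "?")
                text = response.get("fullTextAnnotation", {}).get("text", "")
                out.write(f"\n----- page {page} -----\n")
                out.write(text)


if __name__ == "__main__":
    # Illustrative paths; adjust to your own bucket and machine.
    merge_ocr_output_to_txt("gs://coin_ocr_bucket1/Output_Results/",
                            "numismata_1743_ocr.txt")
```

Keeping the page markers in the .txt output would make it easier to cross-check passages against the page images on the Internet Archive; they can be dropped once the transcription has been verified.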