Skip to content
This repository was archived by the owner on Sep 20, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 153 additions & 0 deletions samples/snippets/batch_process_documents_processor_version_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# [START documentai_batch_process_documents_processor_version]
import re

from google.api_core.client_options import ClientOptions
from google.cloud import documentai, storage

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Example: aeb8cea219b7c272
# processor_version_id = "YOUR_PROCESSOR_VERSION_ID" # Example: pretrained-ocr-v1.0-2020-09-23
# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
# input_mime_type = "application/pdf"
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/


def batch_process_documents_processor_version(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version_id: str,
    gcs_input_uri: str,
    input_mime_type: str,
    gcs_output_bucket: str,
    gcs_output_uri_prefix: str,
    timeout: int = 300,
) -> None:
    """Batch-process a Cloud Storage document with a specific processor version.

    Sends a BatchProcess request, waits for the long-running operation to
    finish, then lists the output written to Cloud Storage and prints the text
    of every JSON document found there.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location, e.g. 'us' or 'eu'.
        processor_id: ID of the processor, e.g. 'aeb8cea219b7c272'.
        processor_version_id: ID of the processor version to use,
            e.g. 'pretrained-ocr-v1.0-2020-09-23'.
        gcs_input_uri: Input document, e.g. 'gs://bucket/directory/file.pdf'.
        input_mime_type: MIME type of the input, e.g. 'application/pdf'.
        gcs_output_bucket: Output bucket URI, e.g. 'gs://bucket'.
        gcs_output_uri_prefix: Output directory inside the bucket, e.g.
            'directory/subdirectory/'. Leading/trailing slashes are optional.
        timeout: Seconds to wait for the operation to complete.

    Raises:
        ValueError: If the operation metadata reports a state other than
            SUCCEEDED.

    Any exception raised while waiting on the operation is propagated
    unchanged.
    """
    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    gcs_document = documentai.GcsDocument(
        gcs_uri=gcs_input_uri, mime_type=input_mime_type
    )

    # Load GCS Input URI into a List of document files
    gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
    input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)

    # NOTE: Alternatively, specify a GCS URI Prefix to process an entire directory
    #
    # gcs_input_uri = "gs://bucket/directory/"
    # gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
    # input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
    #

    # Cloud Storage URI for the Output Directory.
    # Strip surrounding slashes before joining so that a prefix written in the
    # documented "directory/subdirectory/" form does not yield "//" in the URI.
    # str() keeps callers that pass a non-str value (e.g. a UUID) working.
    bucket_uri = str(gcs_output_bucket).rstrip("/")
    output_prefix_clean = str(gcs_output_uri_prefix).strip("/")
    if output_prefix_clean:
        destination_uri = f"{bucket_uri}/{output_prefix_clean}/"
    else:
        destination_uri = f"{bucket_uri}/"

    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=destination_uri
    )

    # Where to write results
    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

    # The full resource name of the processor version
    # e.g. projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version_id
    )

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    # BatchProcess returns a Long Running Operation (LRO)
    operation = client.batch_process_documents(request)

    # Continually polls the operation until it is complete.
    # This could take some time for larger files
    # Format: projects/PROJECT_NUMBER/locations/LOCATION/operations/OPERATION_ID
    print(f"Waiting for operation {operation.operation.name} to complete...")
    operation.result(timeout=timeout)

    # NOTE: Can also use callbacks for asynchronous processing
    #
    # def my_callback(future):
    #   result = future.result()
    #
    # operation.add_done_callback(my_callback)

    # Once the operation is complete,
    # get output document information from operation metadata
    metadata = documentai.BatchProcessMetadata(operation.metadata)

    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
        raise ValueError(f"Batch Process Failed: {metadata.state_message}")

    storage_client = storage.Client()

    print("Output files:")
    # One process per Input Document
    for process in metadata.individual_process_statuses:
        # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
        # The Cloud Storage API requires the bucket name and URI prefix separately
        matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
        if not matches:
            print(
                "Could not parse output GCS destination:",
                process.output_gcs_destination,
            )
            continue

        output_bucket, output_prefix = matches.groups()

        # Get List of Document Objects from the Output Bucket
        output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)

        # Document AI may output multiple JSON files per source file
        for blob in output_blobs:
            # Document AI should only output JSON files to GCS.
            # Test the suffix rather than a substring so that names such as
            # "result.json.tmp" are not parsed as documents.
            if not blob.name.endswith(".json"):
                print(
                    f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
                )
                continue

            # Download JSON File as bytes object and convert to Document Object
            print(f"Fetching {blob.name}")
            document = documentai.Document.from_json(
                blob.download_as_bytes(), ignore_unknown_fields=True
            )

            # For a full list of Document object attributes, please reference this page:
            # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document

            # Read the text recognition output from the processor
            print("The document contains the following text:")
            print(document.text)


# [END documentai_batch_process_documents_processor_version]
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from uuid import uuid4

from google.cloud import storage
from google.cloud.exceptions import NotFound
import pytest
from samples.snippets import batch_process_documents_processor_version_sample

# Inputs passed to the sample by the test below.
location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "90484cfdedb024f6"
processor_version_id = "pretrained-form-parser-v1.0-2020-09-23"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
input_mime_type = "application/pdf"
# The sample annotates this parameter as `str`, so convert the UUID explicitly
# instead of relying on implicit f-string formatting inside the sample.
gcs_output_uri_prefix = str(uuid4())
# Unique bucket name per test run.
BUCKET_NAME = f"document-ai-python-{uuid4()}"


@pytest.fixture(scope="module")
def test_bucket():
    """Create the module's output bucket, yield its name, then clean it up."""
    gcs = storage.Client()
    created = gcs.create_bucket(BUCKET_NAME)
    yield created.name

    # Teardown: remove every blob, then the bucket itself.
    try:
        leftovers = list(created.list_blobs())
        for leftover in leftovers:
            leftover.delete()
        created.delete()
    except NotFound:
        print("Bucket already deleted.")


def test_batch_process_documents_processor_version(capsys, test_bucket):
    """Run the sample against the test bucket and check its printed output."""
    sample = batch_process_documents_processor_version_sample
    sample.batch_process_documents_processor_version(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        processor_version_id=processor_version_id,
        gcs_input_uri=gcs_input_uri,
        input_mime_type=input_mime_type,
        gcs_output_bucket=f"gs://{test_bucket}",
        gcs_output_uri_prefix=gcs_output_uri_prefix,
    )
    captured = capsys.readouterr()
    printed = captured.out

    # Markers expected in the sample's stdout, checked in order.
    for marker in ("operation", "Fetching", "text:"):
        assert marker in printed
assert "text:" in out
10 changes: 0 additions & 10 deletions samples/snippets/batch_process_documents_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
# processor_version = "pretrained" # Optional. Processor version to use
# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
# input_mime_type = "application/pdf"
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
Expand Down Expand Up @@ -73,17 +72,8 @@ def batch_process_documents(

# The full resource name of the processor, e.g.:
# projects/project_id/locations/location/processors/processor_id
# You must create new processors in the Cloud Console first
name = client.processor_path(project_id, location, processor_id)

# NOTE: Alternatively, specify the processor_version to specify a particular version of the processor to use
# projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processorVersion}
#
# name = client.processor_version_path(
# project_id, location, processor_id, processor_version
# )
#

request = documentai.BatchProcessRequest(
name=name,
input_documents=input_config,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,4 @@ def test_batch_process_documents_with_bad_input(capsys):
out, _ = capsys.readouterr()
assert "Failed" in out
except Exception as e:
assert "Internal error" in e.message
assert "Failed" in e.message
58 changes: 58 additions & 0 deletions samples/snippets/delete_processor_version_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# [START documentai_delete_processor_version]

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import FailedPrecondition, InvalidArgument
from google.cloud import documentai

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
# processor_version_id = 'YOUR_PROCESSOR_VERSION_ID'


def delete_processor_version_sample(
    project_id: str, location: str, processor_id: str, processor_version_id: str
):
    """Request deletion of a processor version and wait for it to finish.

    Prints the name of the delete operation. If the request is rejected with
    FailedPrecondition or InvalidArgument, prints the error message instead of
    raising.
    """
    # You must set the api_endpoint if you use a location other than 'us', e.g.:
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # The full resource name of the processor version
    # e.g.: projects/project_id/locations/location/processors/processor_id/processorVersions/processor_version_id
    version_name = client.processor_version_path(
        project_id, location, processor_id, processor_version_id
    )

    # Make DeleteProcessorVersion request.
    # Delete request will fail if the processor version doesn't exist,
    # or if a request is made on a pretrained processor version
    # or the default processor version.
    try:
        delete_operation = client.delete_processor_version(name=version_name)
        # Print operation details
        print(delete_operation.operation.name)
        # Wait for operation to complete
        delete_operation.result()
    except (FailedPrecondition, InvalidArgument) as error:
        print(error.message)


# [END documentai_delete_processor_version]
47 changes: 47 additions & 0 deletions samples/snippets/delete_processor_version_sample_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

import mock
from samples.snippets import delete_processor_version_sample

# Project comes from the environment; the remaining identifiers are
# placeholder values handed to the sample by the test below.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
location = "us"
processor_id = "aaaaaaaaa"
processor_version_id = "xxxxxxxxxx"


@mock.patch(
    "google.cloud.documentai.DocumentProcessorServiceClient.delete_processor_version"
)
@mock.patch("google.api_core.operation.Operation")
def test_delete_processor_version(
    operation_mock, delete_processor_version_mock, capsys
):
    """Check the sample issues exactly one delete call and prints the operation."""
    # The patched client method hands back the mocked operation.
    delete_processor_version_mock.return_value = operation_mock

    sample_kwargs = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "processor_version_id": processor_version_id,
    }
    delete_processor_version_sample.delete_processor_version_sample(**sample_kwargs)

    delete_processor_version_mock.assert_called_once()

    captured = capsys.readouterr()

    assert "operation" in captured.out
Loading