Skip to content

Commit f008893

Browse files
gcf-owl-bot[bot]partheaohmayr
authored
feat: [google-cloud-documentai] Support a new Layout Processor in Document AI (#12541)
BEGIN_COMMIT_OVERRIDE feat: Support a new Layout Processor in Document AI docs: keep the API doc up-to-date with recent changes END_COMMIT_OVERRIDE - [ ] Regenerate this pull request now. docs: keep the API doc up-to-date with recent changes PiperOrigin-RevId: 621233157 Source-Link: googleapis/googleapis@d5020ff Source-Link: googleapis/googleapis-gen@3beacfd Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLWRvY3VtZW50YWkvLk93bEJvdC55YW1sIiwiaCI6IjNiZWFjZmQwMmY4Y2Y2NTBiYmFlNmVmOGMzNzEzMWM5ODcyM2ZhMTcifQ== --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Anthonios Partheniou <partheniou@google.com> Co-authored-by: ohmayr <omairnaveed@ymail.com>
1 parent c0a0bf6 commit f008893

File tree

4 files changed

+390
-5
lines changed

4 files changed

+390
-5
lines changed

packages/google-cloud-documentai/google/cloud/documentai_v1beta3/types/document.py

Lines changed: 325 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,10 @@ class Document(proto.Message):
104104
revisions (MutableSequence[google.cloud.documentai_v1beta3.types.Document.Revision]):
105105
Placeholder. Revision history of this
106106
document.
107+
document_layout (google.cloud.documentai_v1beta3.types.Document.DocumentLayout):
108+
Parsed layout of the document.
109+
chunked_document (google.cloud.documentai_v1beta3.types.Document.ChunkedDocument):
110+
Document chunked based on chunking config.
107111
"""
108112

109113
class ShardInfo(proto.Message):
@@ -1811,6 +1815,317 @@ class TextChange(proto.Message):
18111815
message="Document.Provenance",
18121816
)
18131817

1818+
class DocumentLayout(proto.Message):
    r"""Parsed layout of a document, expressed as the collection of
    blocks the document is split into.

    Attributes:
        blocks (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock]):
            The blocks that make up the document.
    """

    class DocumentLayoutBlock(proto.Message):
        r"""A single layout block. Its payload is one of the supported
        block kinds: text, table or list.

        This message has `oneof`_ fields (mutually exclusive fields).
        For each oneof, at most one member field can be set at the same time.
        Setting any member of the oneof automatically clears all other
        members.

        .. _oneof: https://proto-plus-python.readthedocs.io/en/stable/fields.html#oneofs-mutually-exclusive-fields

        Attributes:
            text_block (google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutTextBlock):
                Payload for a block made of text content.

                This field is a member of `oneof`_ ``block``.
            table_block (google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutTableBlock):
                Payload for a block made of table
                content/structure.

                This field is a member of `oneof`_ ``block``.
            list_block (google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutListBlock):
                Payload for a block made of list
                content/structure.

                This field is a member of `oneof`_ ``block``.
            block_id (str):
                Identifier of the block.
            page_span (google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutPageSpan):
                Pages covered by the block.
        """

        class LayoutPageSpan(proto.Message):
            r"""Start and end position of a block within the document.

            Attributes:
                page_start (int):
                    Page on which the block begins.
                page_end (int):
                    Page on which the block finishes.
            """

            page_start: int = proto.Field(proto.INT32, number=1)
            page_end: int = proto.Field(proto.INT32, number=2)

        class LayoutTextBlock(proto.Message):
            r"""A block whose content is text.

            Attributes:
                text (str):
                    The text held by the block.
                type_ (str):
                    Kind of text in the block. Available options are:
                    ``paragraph``, ``subtitle``, ``heading-1``,
                    ``heading-2``, ``heading-3``, ``heading-4``,
                    ``heading-5``, ``header``, ``footer``.
                blocks (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock]):
                    Child blocks of this text block. Being
                    repeated, they allow further hierarchies and
                    nesting.
            """

            text: str = proto.Field(proto.STRING, number=1)
            type_: str = proto.Field(proto.STRING, number=2)
            # String forward reference: the parent block class is still
            # being defined while this nested class body executes.
            blocks: MutableSequence[
                "Document.DocumentLayout.DocumentLayoutBlock"
            ] = proto.RepeatedField(
                proto.MESSAGE,
                message="Document.DocumentLayout.DocumentLayoutBlock",
                number=3,
            )

        class LayoutTableBlock(proto.Message):
            r"""A block whose content is a table.

            Attributes:
                header_rows (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow]):
                    Rows forming the header at the top of the table.
                body_rows (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow]):
                    Rows holding the main content of the table.
                caption (str):
                    Caption/title of the table.
            """

            header_rows: MutableSequence[
                "Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow"
            ] = proto.RepeatedField(
                proto.MESSAGE,
                message="Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow",
                number=1,
            )
            body_rows: MutableSequence[
                "Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow"
            ] = proto.RepeatedField(
                proto.MESSAGE,
                message="Document.DocumentLayout.DocumentLayoutBlock.LayoutTableRow",
                number=2,
            )
            caption: str = proto.Field(proto.STRING, number=3)

        class LayoutTableRow(proto.Message):
            r"""One row of a table.

            Attributes:
                cells (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutTableCell]):
                    The cells that make up the row.
            """

            cells: MutableSequence[
                "Document.DocumentLayout.DocumentLayoutBlock.LayoutTableCell"
            ] = proto.RepeatedField(
                proto.MESSAGE,
                message="Document.DocumentLayout.DocumentLayoutBlock.LayoutTableCell",
                number=1,
            )

        class LayoutTableCell(proto.Message):
            r"""One cell of a table row.

            Attributes:
                blocks (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock]):
                    The blocks contained in the cell. Being
                    repeated, they allow further hierarchies and
                    nesting.
                row_span (int):
                    Number of rows the cell spans.
                col_span (int):
                    Number of columns the cell spans.
            """

            blocks: MutableSequence[
                "Document.DocumentLayout.DocumentLayoutBlock"
            ] = proto.RepeatedField(
                proto.MESSAGE,
                message="Document.DocumentLayout.DocumentLayoutBlock",
                number=1,
            )
            row_span: int = proto.Field(proto.INT32, number=2)
            col_span: int = proto.Field(proto.INT32, number=3)

        class LayoutListBlock(proto.Message):
            r"""A block whose content is a list.

            Attributes:
                list_entries (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock.LayoutListEntry]):
                    The entries that make up the list block.
                type_ (str):
                    Kind of the list entries, if any exist.
                    Available options are ``ordered`` and
                    ``unordered``.
            """

            list_entries: MutableSequence[
                "Document.DocumentLayout.DocumentLayoutBlock.LayoutListEntry"
            ] = proto.RepeatedField(
                proto.MESSAGE,
                message="Document.DocumentLayout.DocumentLayoutBlock.LayoutListEntry",
                number=1,
            )
            type_: str = proto.Field(proto.STRING, number=2)

        class LayoutListEntry(proto.Message):
            r"""One entry of a list.

            Attributes:
                blocks (MutableSequence[google.cloud.documentai_v1beta3.types.Document.DocumentLayout.DocumentLayoutBlock]):
                    The blocks contained in the entry. Being
                    repeated, they allow further hierarchies and
                    nesting.
            """

            blocks: MutableSequence[
                "Document.DocumentLayout.DocumentLayoutBlock"
            ] = proto.RepeatedField(
                proto.MESSAGE,
                message="Document.DocumentLayout.DocumentLayoutBlock",
                number=1,
            )

        # The three payload fields share the ``block`` oneof, so setting
        # one of them clears the other two.
        text_block: "Document.DocumentLayout.DocumentLayoutBlock.LayoutTextBlock" = proto.Field(
            proto.MESSAGE,
            message="Document.DocumentLayout.DocumentLayoutBlock.LayoutTextBlock",
            oneof="block",
            number=2,
        )
        table_block: "Document.DocumentLayout.DocumentLayoutBlock.LayoutTableBlock" = proto.Field(
            proto.MESSAGE,
            message="Document.DocumentLayout.DocumentLayoutBlock.LayoutTableBlock",
            oneof="block",
            number=3,
        )
        list_block: "Document.DocumentLayout.DocumentLayoutBlock.LayoutListBlock" = proto.Field(
            proto.MESSAGE,
            message="Document.DocumentLayout.DocumentLayoutBlock.LayoutListBlock",
            oneof="block",
            number=4,
        )
        block_id: str = proto.Field(proto.STRING, number=1)
        page_span: "Document.DocumentLayout.DocumentLayoutBlock.LayoutPageSpan" = proto.Field(
            proto.MESSAGE,
            message="Document.DocumentLayout.DocumentLayoutBlock.LayoutPageSpan",
            number=5,
        )

    blocks: MutableSequence[
        "Document.DocumentLayout.DocumentLayoutBlock"
    ] = proto.RepeatedField(
        proto.MESSAGE,
        message="Document.DocumentLayout.DocumentLayoutBlock",
        number=1,
    )
2062+
2063+
class ChunkedDocument(proto.Message):
    r"""The chunks a document has been divided into.

    Attributes:
        chunks (MutableSequence[google.cloud.documentai_v1beta3.types.Document.ChunkedDocument.Chunk]):
            The chunks of the document.
    """

    class Chunk(proto.Message):
        r"""A single chunk of the document.

        Attributes:
            chunk_id (str):
                Identifier of the chunk.
            source_block_ids (MutableSequence[str]):
                IDs of all parsed document layout source
                blocks that were used to generate the chunk.
            content (str):
                The text held by the chunk.
            page_span (google.cloud.documentai_v1beta3.types.Document.ChunkedDocument.Chunk.ChunkPageSpan):
                Pages covered by the chunk.
        """

        class ChunkPageSpan(proto.Message):
            r"""Start and end position of a chunk within the document.

            Attributes:
                page_start (int):
                    Page on which the chunk begins.
                page_end (int):
                    Page on which the chunk finishes.
            """

            page_start: int = proto.Field(proto.INT32, number=1)
            page_end: int = proto.Field(proto.INT32, number=2)

        chunk_id: str = proto.Field(proto.STRING, number=1)
        source_block_ids: MutableSequence[str] = proto.RepeatedField(
            proto.STRING,
            number=2,
        )
        content: str = proto.Field(proto.STRING, number=3)
        page_span: "Document.ChunkedDocument.Chunk.ChunkPageSpan" = proto.Field(
            proto.MESSAGE,
            message="Document.ChunkedDocument.Chunk.ChunkPageSpan",
            number=4,
        )

    chunks: MutableSequence["Document.ChunkedDocument.Chunk"] = proto.RepeatedField(
        proto.MESSAGE,
        message="Document.ChunkedDocument.Chunk",
        number=1,
    )
2128+
18142129
uri: str = proto.Field(
18152130
proto.STRING,
18162131
number=1,
@@ -1869,6 +2184,16 @@ class TextChange(proto.Message):
18692184
number=13,
18702185
message=Revision,
18712186
)
2187+
# Parsed layout of the document.
document_layout: DocumentLayout = proto.Field(
    proto.MESSAGE,
    message=DocumentLayout,
    number=17,
)
# Document chunked based on the chunking config.
chunked_document: ChunkedDocument = proto.Field(
    proto.MESSAGE,
    message=ChunkedDocument,
    number=18,
)
18722197

18732198

18742199
class RevisionRef(proto.Message):

0 commit comments

Comments
 (0)