Merged
Show file tree
Hide file tree
Changes from all commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
d141a67
feat: add get_document and list_document functions
galz10Sep 27, 2022
1de3f02
fixed tests
galz10Sep 27, 2022
1b9324f
lint fix
galz10Sep 27, 2022
446e187
added tests and changed DocumentWrapper
galz10Sep 28, 2022
ed86e84
fixed failing test
galz10Sep 28, 2022
9fc848b
updated tests
galz10Sep 29, 2022
88001da
changed DocStrings and tests
galz10Sep 29, 2022
1090173
fixed failing test
galz10Sep 29, 2022
0cec6b5
changed name and return type of list_documents
galz10Sep 30, 2022
0b67124
updated failing tests
galz10Sep 30, 2022
01e551f
lint fix
galz10Sep 30, 2022
c25faf3
chore: updated comments
galz10Sep 30, 2022
cf8abef
updating naming for get_document to get_shards
galz10Oct 3, 2022
a8bbf7a
revert get_document changes
galz10Oct 3, 2022
3c547e0
Merge branch 'main' into update_comments
galz10Oct 3, 2022
8df5b0b
lint fix
galz10Oct 3, 2022
6ac3b43
added code-block to comments
galz10Oct 4, 2022
0b1b52a
feat: add TableWrapper and helper functions
galz10Oct 4, 2022
fb81212
wrapped lines and paragraphs
galz10Oct 6, 2022
eaabc42
Merge branch 'main' into wrap-table
galz10Oct 6, 2022
efc302f
Merge branch 'main' into wrap-table
galz10Oct 7, 2022
88bbe01
added tests for new features
galz10Oct 7, 2022
f7ede09
lint fix
galz10Oct 7, 2022
86c753c
feat: added helper functions to DocumentWrapper
galz10Oct 12, 2022
8bcf3e2
lint fix
galz10Oct 12, 2022
1c88552
fixed failing test
galz10Oct 12, 2022
5ffd706
refactored code
galz10Oct 12, 2022
224b589
lint fix
galz10Oct 12, 2022
69e847c
lint fix
galz10Oct 12, 2022
f9975bc
refactored code
galz10Oct 13, 2022
63e00fd
Merge branch 'main' into add-helpers
galz10Oct 18, 2022
4dc1ec6
fixed failing test
galz10Oct 18, 2022
91f348f
refactored code
galz10Oct 18, 2022
328ee4c
Merge branch 'main' into add-helpers
galz10Oct 19, 2022
249983c
added text fixture to simplify testing
galz10Oct 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Failed to load files.
Original file line numberDiff line numberDiff line change
Expand Up@@ -36,8 +36,8 @@ def _entities_from_shards(
Required. List of document shards.

Returns:
List[wrapped_entity.Entity]:
a list of Entitys.
List[Entity]:
a list of Entity.

"""
result = []
Expand All@@ -55,8 +55,8 @@ def _pages_from_shards(shards: documentai.Document) -> List[Page]:
Required. List of document shards.

Returns:
List[wrapped_page.Page]:
A list of Pages.
List[Page]:
A list of Page.

"""
result = []
Expand DownExpand Up@@ -227,3 +227,54 @@ def __post_init__(self):
self._shards = _get_shards(gcs_prefix=self.gcs_prefix)
self.pages = _pages_from_shards(shards=self._shards)
self.entities = _entities_from_shards(shards=self._shards)

def search_pages(
self, target_string: str = None, pattern: str = None
) -> List[Page]:
r"""Returns the list of Page containing target_string.

Args:
target_string (str):
Optional. target str.
pattern (str):
Optional. regex str.

Returns:
List[Page]:
A list of Page.

"""
if (target_string is None and pattern is None) or (
target_string is not None and pattern is not None
):
raise ValueError(
"Exactly one of target_string and pattern must be specified."
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe "or" instead of "and".

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is checking if both target_string and pattern are populated so it needs an "and" to make sure both are not none.

)

found_pages = []
for page in self.pages:
for paragraph in page.paragraphs:
if target_string is not None and target_string in paragraph.text:
found_pages.append(page)
break
elif (
pattern is not None
and re.search(pattern, paragraph.text) is not None
):
found_pages.append(page)
break
return found_pages

def get_entity_by_type(self, target_type: str) -> List[Entity]:
    r"""Collects the wrapped entities whose type equals target_type.

    Args:
        target_type (str):
            Required. The entity type to select.

    Returns:
        List[Entity]:
            The items of self.entities whose type_ equals target_type,
            in their original order. Empty when nothing matches.

    """
    matches = []
    for candidate in self.entities:
        if candidate.type_ == target_type:
            matches.append(candidate)
    return matches
Original file line numberDiff line numberDiff line change
Expand Up@@ -142,7 +142,7 @@ def _table_wrapper_from_documentai_table(
class Paragraph:
"""Represents a wrapped documentai.Document.Page.Paragraph.
Attributes:
_documentai_table (google.cloud.documentai.Document.Page.Paragraph):
documentai_paragraph (google.cloud.documentai.Document.Page.Paragraph):
Required.The original google.cloud.documentai.Document.Page.Paragraph object.
text (str):
Required. UTF-8 encoded text.
Expand All@@ -156,7 +156,7 @@ class Paragraph:
class Line:
"""Represents a wrapped documentai.Document.Page.Line.
Attributes:
_documentai_line (google.cloud.documentai.Document.Page.Line):
documentai_line (google.cloud.documentai.Document.Page.Line):
Required.The original google.cloud.documentai.Document.Page.Line object.
text (str):
Required. UTF-8 encoded text.
Expand All@@ -169,16 +169,16 @@ class Line:
def _get_paragraphs(
paragraphs: List[documentai.Document.Page.Paragraph], text: str
) -> List[Paragraph]:
r"""Returns a list of ParagraphWrapper.
r"""Returns a list of Paragraph.
Args:
paragraphs (List[documentai.Document.Page.Paragraph]):
Required. a list of documentai.Document.Page.Paragraph objects.
text (str):
Required. UTF-8 encoded text in reading order
from the document.
Returns:
List[str]:
A list of texts from a List[ParagraphWrapper].
List[Paragraph]:
A list of Paragraph.
"""
result = []

Expand All@@ -196,16 +196,16 @@ def _get_paragraphs(


def _get_lines(lines: List[documentai.Document.Page.Line], text: str) -> List[Line]:
r"""Returns a list of LineWrapper.
r"""Returns a list of Line.
Args:
paragraphs (List[documentai.Document.Page.Line]):
Required. a list of documentai.Document.Page.Line objects.
text (str):
Required. UTF-8 encoded text in reading order
from the document.
Returns:
List[str]:
A list of texts from a List[LineWrapper].
List[Line]:
A list of Line.
"""
result = []

Expand Down
Original file line numberDiff line numberDiff line change
Expand Up@@ -39,6 +39,20 @@ def get_bytes(file_name):
return result


@pytest.fixture
def get_bytes_single_file_mock():
    """Replaces document._get_bytes with a mock for the duration of a test.

    The mock returns whatever get_bytes("tests/unit/resources/0") loads.

    Yields:
        The mock standing in for document._get_bytes, so tests can make
        call assertions on it.
    """
    with mock.patch.object(document, "_get_bytes") as byte_factory:
        byte_factory.return_value = get_bytes("tests/unit/resources/0")
        yield byte_factory


@pytest.fixture
def get_bytes_multiple_files_mock():
    """Replaces document._get_bytes with a mock for the duration of a test.

    The mock returns whatever get_bytes("tests/unit/resources/1") loads.

    Yields:
        The mock standing in for document._get_bytes, so tests can make
        call assertions on it.
    """
    with mock.patch.object(document, "_get_bytes") as byte_factory:
        byte_factory.return_value = get_bytes("tests/unit/resources/1")
        yield byte_factory


def test_get_shards_with_gcs_uri_contains_file_type():
    """Expects _get_shards to raise ValueError for a URI that ends in '0.json'."""
    uri_with_file_type = "gs://test-directory/documentai/output/123456789/0.json"
    expected_message = "gcs_prefix cannot contain file types"

    with pytest.raises(ValueError, match=expected_message):
        document._get_shards(uri_with_file_type)
Expand All@@ -49,14 +63,12 @@ def test_get_shards_with_invalid_gcs_uri():
document._get_shards("test-directory/documentai/output/")


def test_get_shards_with_valid_gcs_uri():
    """Loads shards through a patched document._get_bytes and checks the first page."""
    with mock.patch.object(document, "_get_bytes") as factory:
        factory.return_value = get_bytes("tests/unit/resources/0")
        actual = document._get_shards(
            "gs://test-directory/documentai/output/123456789/0"
        )
        # We are testing only one of the fields to make sure the file content could be loaded.
        assert actual[0].pages[0].page_number == 1
def test_get_shards_with_valid_gcs_uri(get_bytes_single_file_mock):
    """Loads shards through the mocked document._get_bytes and checks the first page."""
    actual = document._get_shards("gs://test-directory/documentai/output/123456789/0")

    # assert_called_once() is the real assertion; called_once() on a Mock
    # verifies nothing (and raises AttributeError on Python 3.12+).
    get_bytes_single_file_mock.assert_called_once()
    # We are testing only one of the fields to make sure the file content could be loaded.
    assert actual[0].pages[0].page_number == 1


def test_pages_from_shards():
Expand All@@ -79,18 +91,16 @@ def test_entities_from_shard():
assert actual[0].type_ == "vat"


def test_wrapped_document_with_single_shard():
    """Builds a Document from the '0' resource and expects exactly one page."""
    with mock.patch.object(document, "_get_bytes") as factory:
        factory.return_value = get_bytes("tests/unit/resources/0")
        actual = document.Document("gs://test-directory/documentai/output/123456789/0")
        assert len(actual.pages) == 1
def test_wrapped_document_with_single_shard(get_bytes_single_file_mock):
    """Builds a Document from the mocked single-file bytes and expects one page."""
    actual = document.Document("gs://test-directory/documentai/output/123456789/0")

    # assert_called_once() is the real assertion; called_once() on a Mock
    # verifies nothing (and raises AttributeError on Python 3.12+).
    get_bytes_single_file_mock.assert_called_once()
    assert len(actual.pages) == 1


def test_wrapped_document_with_multiple_shards():
    """Builds a Document from the '1' resource and expects 48 pages."""
    with mock.patch.object(document, "_get_bytes") as factory:
        factory.return_value = get_bytes("tests/unit/resources/1")
        actual = document.Document("gs://test-directory/documentai/output/123456789/1")
        assert len(actual.pages) == 48
def test_wrapped_document_with_multiple_shards(get_bytes_multiple_files_mock):
    """Builds a Document from the mocked multi-file bytes and expects 48 pages."""
    actual = document.Document("gs://test-directory/documentai/output/123456789/1")

    # assert_called_once() is the real assertion; called_once() on a Mock
    # verifies nothing (and raises AttributeError on Python 3.12+).
    get_bytes_multiple_files_mock.assert_called_once()
    assert len(actual.pages) == 48


@mock.("google.cloud.documentai_toolbox.wrappers.document.storage")
Expand DownExpand Up@@ -235,3 +245,58 @@ def test_print_gcs_document_tree_with_gcs_uri_contains_file_type():
def test_print_gcs_document_tree_with_invalid_gcs_uri():
    """Expects print_gcs_document_tree to raise ValueError for a URI without 'gs://'."""
    uri_without_scheme = "documentai/output/123456789/1"
    expected_message = "gcs_prefix does not match accepted format"

    with pytest.raises(ValueError, match=expected_message):
        document.print_gcs_document_tree(uri_without_scheme)


def test_search_page_with_target_string(get_bytes_single_file_mock):
    """Searches the mocked document for a literal string and expects one matching page."""
    doc = document.Document("gs://test-directory/documentai/output/123456789/0")

    actual_string = doc.search_pages(target_string="contract")

    # assert_called_once() is the real assertion; called_once() on a Mock
    # verifies nothing (and raises AttributeError on Python 3.12+).
    get_bytes_single_file_mock.assert_called_once()
    assert len(actual_string) == 1


def test_search_page_with_target_pattern(get_bytes_single_file_mock):
    """Searches the mocked document with a regex and expects one matching page."""
    doc = document.Document("gs://test-directory/documentai/output/123456789/0")

    actual_regex = doc.search_pages(pattern=r"\$\d+(?:\.\d+)?")

    # assert_called_once() is the real assertion; called_once() on a Mock
    # verifies nothing (and raises AttributeError on Python 3.12+).
    get_bytes_single_file_mock.assert_called_once()
    assert len(actual_regex) == 1


def test_search_page_with_regex_and_str(get_bytes_single_file_mock):
    """Expects search_pages to raise ValueError when both arguments are given."""
    doc = document.Document("gs://test-directory/documentai/output/123456789/0")

    # Only the call expected to raise lives inside the context manager, so a
    # failure while building the Document is not mistaken for the error under test.
    with pytest.raises(
        ValueError,
        match="Exactly one of target_string and pattern must be specified.",
    ):
        doc.search_pages(pattern=r"^\$?(\d*(\d\.?|\.\d{1,2}))$", target_string="hello")

    # Checked after the block: a statement following the raising call inside
    # pytest.raises would never run. assert_called_once() is the real assertion.
    get_bytes_single_file_mock.assert_called_once()


def test_search_page_with_none(get_bytes_single_file_mock):
    """Expects search_pages to raise ValueError when neither argument is given."""
    doc = document.Document("gs://test-directory/documentai/output/123456789/0")

    # Only the call expected to raise lives inside the context manager, so a
    # failure while building the Document is not mistaken for the error under test.
    with pytest.raises(
        ValueError,
        match="Exactly one of target_string and pattern must be specified.",
    ):
        doc.search_pages()

    # Checked after the block: a statement following the raising call inside
    # pytest.raises would never run. assert_called_once() is the real assertion.
    get_bytes_single_file_mock.assert_called_once()


def test_get_entity_by_type(get_bytes_single_file_mock):
    """Filters the mocked document's entities by type and checks the single match."""
    doc = document.Document("gs://test-directory/documentai/output/123456789/0")

    actual = doc.get_entity_by_type(target_type="receiver_address")

    # assert_called_once() is the real assertion; called_once() on a Mock
    # verifies nothing (and raises AttributeError on Python 3.12+).
    get_bytes_single_file_mock.assert_called_once()

    assert len(actual) == 1
    assert actual[0].type_ == "receiver_address"
    assert actual[0].mention_text == "222 Main Street\nAnytown, USA"