2 files changed (+52 lines, -0 lines)
@@ -27,13 +27,16 @@
 import json
 import math
 import os
+import packaging.version
 import tempfile
 from typing import Any, BinaryIO, Dict, Iterable, Optional, Sequence, Tuple, Union
 import uuid
 import warnings

 try:
     import pyarrow
+
+    _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__)
 except ImportError:  # pragma: NO COVER
     pyarrow = None

@@ -118,6 +121,9 @@
 # https://github.com/googleapis/python-bigquery/issues/438
 _MIN_GET_QUERY_RESULTS_TIMEOUT = 120

+# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
+_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])
+

 class Project(object):
     """Wrapper for resource describing a BigQuery project.
@@ -2609,6 +2615,15 @@ def load_table_from_dataframe(
         try:

             if job_config.source_format == job.SourceFormat.PARQUET:
+                if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS:
+                    msg = (
+                        "Loading dataframe data in PARQUET format with pyarrow "
+                        f"{_PYARROW_VERSION} can result in data corruption. It is "
+                        "therefore *strongly* advised to use a different pyarrow "
+                        "version or a different source format. "
+                        "See: https://github.com/googleapis/python-bigquery/issues/781"
+                    )
+                    warnings.warn(msg, category=RuntimeWarning)

                 if job_config.schema:
                     if parquet_compression == "snappy":  # adjust the default value
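Note that the check only warns; the load job still proceeds on a bad pyarrow version. A standalone sketch of the gate under that assumption (the helper name _warn_if_bad_pyarrow is hypothetical; the diff inlines the check in Client.load_table_from_dataframe):

import warnings
import packaging.version

_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])

def _warn_if_bad_pyarrow(version_str):
    # Hypothetical helper for illustration only; the real check is inlined
    # in Client.load_table_from_dataframe and does not abort the load.
    version = packaging.version.parse(version_str)
    if version in _PYARROW_BAD_VERSIONS:
        warnings.warn(
            f"pyarrow {version} can result in data corruption when loading "
            "dataframe data in PARQUET format.",
            category=RuntimeWarning,
        )

_warn_if_bad_pyarrow("2.0.0")  # emits a RuntimeWarning
_warn_if_bad_pyarrow("4.0.1")  # silent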
@@ -27,6 +27,7 @@
 import warnings

 import mock
+import packaging
 import requests
 import pytest
 import pytz
@@ -7510,6 +7511,42 @@ def test_load_table_from_dataframe_wo_pyarrow_raises_error(self):
                    parquet_compression="gzip",
                )

+    def test_load_table_from_dataframe_w_bad_pyarrow_issues_warning(self):
+        pytest.importorskip("pandas", reason="Requires `pandas`")
+        pytest.importorskip("pyarrow", reason="Requires `pyarrow`")
+
+        client = self._make_client()
+        records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
+        dataframe = pandas.DataFrame(records)
+
+        pyarrow_version_patch = mock.patch(
+            "google.cloud.bigquery.client._PYARROW_VERSION",
+            packaging.version.parse("2.0.0"),  # A known bad version of pyarrow.
+        )
+        get_table_patch = mock.patch(
+            "google.cloud.bigquery.client.Client.get_table",
+            autospec=True,
+            side_effect=google.api_core.exceptions.NotFound("Table not found"),
+        )
+        load_patch = mock.patch(
+            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
+        )
+
+        with load_patch, get_table_patch, pyarrow_version_patch:
+            with warnings.catch_warnings(record=True) as warned:
+                client.load_table_from_dataframe(
+                    dataframe, self.TABLE_REF, location=self.LOCATION,
+                )
+
+        expected_warnings = [
+            warning for warning in warned if "pyarrow" in str(warning).lower()
+        ]
+        assert len(expected_warnings) == 1
+        assert issubclass(expected_warnings[0].category, RuntimeWarning)
+        msg = str(expected_warnings[0].message)
+        assert "pyarrow 2.0.0" in msg
+        assert "data corruption" in msg
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
     def test_load_table_from_dataframe_w_nulls(self):
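The test captures the warning with warnings.catch_warnings(record=True), which collects emitted warnings as WarningMessage objects (exposing .category and .message) instead of printing them. A self-contained sketch of that capture pattern:

import warnings

with warnings.catch_warnings(record=True) as warned:
    warnings.simplefilter("always")  # avoid the default once-per-location dedup
    warnings.warn("pyarrow 2.0.0 ... data corruption", category=RuntimeWarning)

assert len(warned) == 1
assert issubclass(warned[0].category, RuntimeWarning)
assert "data corruption" in str(warned[0].message)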
