2 files changed (+52 lines, -0 lines)
@@ -27,13 +27,16 @@
 import json
 import math
 import os
+import packaging.version
 import tempfile
 from typing import Any, BinaryIO, Dict, Iterable, Optional, Sequence, Tuple, Union
 import uuid
 import warnings

 try:
     import pyarrow
+
+    _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__)
 except ImportError:  # pragma: NO COVER
     pyarrow = None

@@ -118,6 +121,9 @@
 # https://github.com/googleapis/python-bigquery/issues/438
 _MIN_GET_QUERY_RESULTS_TIMEOUT = 120

+# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
+_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])
+

 class Project(object):
     """Wrapper for resource describing a BigQuery project.
@@ -2609,6 +2615,15 @@ def load_table_from_dataframe(
         try:

             if job_config.source_format == job.SourceFormat.PARQUET:
+                if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS:
+                    msg = (
+                        "Loading dataframe data in PARQUET format with pyarrow "
+                        f"{_PYARROW_VERSION} can result in data corruption. It is "
+                        "therefore *strongly* advised to use a different pyarrow "
+                        "version or a different source format. "
+                        "See: https://github.com/googleapis/python-bigquery/issues/781"
+                    )
+                    warnings.warn(msg, category=RuntimeWarning)

                 if job_config.schema:
                     if parquet_compression == "snappy":  # adjust the default value
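Note that the check only warns; the load job still proceeds on a bad pyarrow version. A standalone sketch of the gate under that assumption (the helper name _warn_if_bad_pyarrow is hypothetical; the diff inlines the check in Client.load_table_from_dataframe):

import warnings
import packaging.version

_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])

def _warn_if_bad_pyarrow(version_str):
    # Hypothetical helper for illustration only; the real check is inlined
    # in Client.load_table_from_dataframe and does not abort the load.
    version = packaging.version.parse(version_str)
    if version in _PYARROW_BAD_VERSIONS:
        warnings.warn(
            f"pyarrow {version} can result in data corruption when loading "
            "dataframe data in PARQUET format.",
            category=RuntimeWarning,
        )

_warn_if_bad_pyarrow("2.0.0")  # emits a RuntimeWarning
_warn_if_bad_pyarrow("4.0.1")  # silent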
@@ -27,6 +27,7 @@
 import warnings

 import mock
+import packaging
 import requests
 import pytest
 import pytz
@@ -7510,6 +7511,42 @@ def test_load_table_from_dataframe_wo_pyarrow_raises_error(self):
                    parquet_compression="gzip",
                )

+    def test_load_table_from_dataframe_w_bad_pyarrow_issues_warning(self):
+        pytest.importorskip("pandas", reason="Requires `pandas`")
+        pytest.importorskip("pyarrow", reason="Requires `pyarrow`")
+
+        client = self._make_client()
+        records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
+        dataframe = pandas.DataFrame(records)
+
+        pyarrow_version_patch = mock.patch(
+            "google.cloud.bigquery.client._PYARROW_VERSION",
+            packaging.version.parse("2.0.0"),  # A known bad version of pyarrow.
+        )
+        get_table_patch = mock.patch(
+            "google.cloud.bigquery.client.Client.get_table",
+            autospec=True,
+            side_effect=google.api_core.exceptions.NotFound("Table not found"),
+        )
+        load_patch = mock.patch(
+            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
+        )
+
+        with load_patch, get_table_patch, pyarrow_version_patch:
+            with warnings.catch_warnings(record=True) as warned:
+                client.load_table_from_dataframe(
+                    dataframe, self.TABLE_REF, location=self.LOCATION,
+                )
+
+        expected_warnings = [
+            warning for warning in warned if "pyarrow" in str(warning).lower()
+        ]
+        assert len(expected_warnings) == 1
+        assert issubclass(expected_warnings[0].category, RuntimeWarning)
+        msg = str(expected_warnings[0].message)
+        assert "pyarrow 2.0.0" in msg
+        assert "data corruption" in msg
+
     @unittest.skipIf(pandas is None, "Requires `pandas`")
     @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
     def test_load_table_from_dataframe_w_nulls(self):
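The test captures the warning with warnings.catch_warnings(record=True), which collects emitted warnings as WarningMessage objects (exposing .category and .message) instead of printing them. A self-contained sketch of that capture pattern:

import warnings

with warnings.catch_warnings(record=True) as warned:
    warnings.simplefilter("always")  # avoid the default once-per-location dedup
    warnings.warn("pyarrow 2.0.0 ... data corruption", category=RuntimeWarning)

assert len(warned) == 1
assert issubclass(warned[0].category, RuntimeWarning)
assert "data corruption" in str(warned[0].message)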
