@@ -50,7 +50,6 @@
 from google.cloud.bigquery.schema import _build_schema_resource
 from google.cloud.bigquery.schema import _parse_schema_resource
 from google.cloud.bigquery.schema import _to_schema_fields
-from google.cloud.bigquery.exceptions import PyarrowMissingWarning
 from google.cloud.bigquery.external_config import ExternalConfig
 from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration

@@ -1679,75 +1678,38 @@ def to_dataframe(
             create_bqstorage_client = False
             bqstorage_client = None

-        if pyarrow is not None:
-            # If pyarrow is available, calling to_arrow, then converting to a
-            # pandas dataframe is about 2x faster. This is because pandas.concat is
-            # rarely no-copy, whereas pyarrow.Table.from_batches + to_pandas is
-            # usually no-copy.
-            record_batch = self.to_arrow(
-                progress_bar_type=progress_bar_type,
-                bqstorage_client=bqstorage_client,
-                create_bqstorage_client=create_bqstorage_client,
-            )
+        record_batch = self.to_arrow(
+            progress_bar_type=progress_bar_type,
+            bqstorage_client=bqstorage_client,
+            create_bqstorage_client=create_bqstorage_client,
+        )
+
+        # When converting timestamp values to nanosecond precision, the result
+        # can be out of pyarrow bounds. To avoid the error when converting to
+        # Pandas, we set the timestamp_as_object parameter to True, if necessary.
+        types_to_check = {
+            pyarrow.timestamp("us"),
+            pyarrow.timestamp("us", tz=pytz.UTC),
+        }

-            # When converting timestamp values to nanosecond precision, the result
-            # can be out of pyarrow bounds. To avoid the error when converting to
-            # Pandas, we set the timestamp_as_object parameter to True, if necessary.
-            types_to_check = {
-                pyarrow.timestamp("us"),
-                pyarrow.timestamp("us", tz=pytz.UTC),
-            }
-
-            for column in record_batch:
-                if column.type in types_to_check:
-                    try:
-                        column.cast("timestamp[ns]")
-                    except pyarrow.lib.ArrowInvalid:
-                        timestamp_as_object = True
-                        break
-            else:
-                timestamp_as_object = False
-
-            extra_kwargs = {"timestamp_as_object": timestamp_as_object}
-
-            df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)
-
-            for column in dtypes:
-                df[column] = pandas.Series(df[column], dtype=dtypes[column])
-            return df
+        for column in record_batch:
+            if column.type in types_to_check:
+                try:
+                    column.cast("timestamp[ns]")
+                except pyarrow.lib.ArrowInvalid:
+                    timestamp_as_object = True
+                    break
         else:
-            warnings.warn(
-                "Converting to a dataframe without pyarrow installed is "
-                "often slower and will become unsupported in the future. "
-                "Please install the pyarrow package.",
-                PyarrowMissingWarning,
-                stacklevel=2,
-            )
+            timestamp_as_object = False

-            # The bqstorage_client is only used if pyarrow is available, so the
-            # rest of this method only needs to account for tabledata.list.
-            progress_bar = self._get_progress_bar(progress_bar_type)
+        extra_kwargs = {"timestamp_as_object": timestamp_as_object}

-            frames = []
-            for frame in self.to_dataframe_iterable(dtypes=dtypes):
-                frames.append(frame)
+        df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)

-                if progress_bar is not None:
-                    # In some cases, the number of total rows is not populated
-                    # until the first page of rows is fetched. Update the
-                    # progress bar's total to keep an accurate count.
-                    progress_bar.total = progress_bar.total or self.total_rows
-                    progress_bar.update(len(frame))
-
-            if progress_bar is not None:
-                # Indicate that the download has finished.
-                progress_bar.close()
-
-            # Avoid concatting an empty list.
-            if not frames:
-                column_names = [field.name for field in self._schema]
-                return pandas.DataFrame(columns=column_names)
-            return pandas.concat(frames, ignore_index=True)
+        for column in dtypes:
+            df[column] = pandas.Series(df[column], dtype=dtypes[column])
+
+        return df


 class _EmptyRowIterator(object):
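A note for reviewers on the `timestamp_as_object` probe this hunk keeps on the now-unconditional pyarrow path: BigQuery returns TIMESTAMP values at microsecond precision, but pandas' default `datetime64[ns]` dtype only covers roughly the years 1677 to 2262, so a perfectly valid BigQuery timestamp can overflow the nanosecond cast. Below is a minimal standalone sketch of the failure mode and the escape hatch, assuming a pyarrow version that supports `timestamp_as_object` (1.0+); it is illustrative, not the library's code path:

```python
import datetime

import pyarrow

# A microsecond-precision timestamp far outside the ~1677-2262 window
# that 64-bit nanosecond timestamps (datetime64[ns]) can represent.
arr = pyarrow.array(
    [datetime.datetime(9999, 12, 31, 23, 59, 59)],
    type=pyarrow.timestamp("us"),
)

# The same probe the diff performs: cast() is safe by default and raises
# ArrowInvalid when a value cannot be represented in nanoseconds.
try:
    arr.cast("timestamp[ns]")
except pyarrow.lib.ArrowInvalid:
    print("out of bounds for timestamp[ns]")

# With timestamp_as_object=True, to_pandas() yields a column of plain
# datetime.datetime objects instead of datetime64[ns], so nothing overflows.
table = pyarrow.table({"ts": arr})
df = table.to_pandas(timestamp_as_object=True)
print(type(df["ts"][0]))  # <class 'datetime.datetime'>
```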
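One subtlety in the added loop: `timestamp_as_object = False` sits in a `for ... else` clause, so it runs only when the loop finishes without `break`, i.e. when every matching column casts cleanly or no column matches at all. A tiny sketch of the pattern, with hypothetical names not taken from the library:

```python
# Python's for/else: the else suite runs only if the loop was never
# interrupted by break.
def needs_fallback(values, limit=100):
    for value in values:
        if value > limit:
            fallback = True  # offending value found; stop scanning
            break
    else:
        fallback = False  # loop completed: everything within bounds
    return fallback

print(needs_fallback([1, 5, 7]))    # False
print(needs_fallback([1, 500, 7]))  # True
print(needs_fallback([]))           # False (empty loop still reaches else)
```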