feat: Add Series.peek to preview data efficiently

TrevorBergeron · TrevorBergeron · commit 5a803a494a78 · 2024-05-28T22:25:43.000Z
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -2054,13 +2054,14 @@ def to_sql_query(
             idx_labels,
         )
 
-    def cached(self, *, optimize_offsets=False, force: bool = False) -> None:
+    # Three strategies: skip if trivially executable, session-aware, or cluster by index.
+    def cached(self, *, force: bool = False, session_aware: bool = False) -> None:
         """Write the block to a session table."""
         # use a heuristic for whether something needs to be cached
         if (not force) and self.session._is_trivially_executable(self.expr):
             return
-        if optimize_offsets:
-            self.session._cache_with_offsets(self.expr)
+        elif session_aware:
+            self.session._session_aware_caching(self.expr)
         else:
             self.session._cache_with_cluster_cols(
                 self.expr, cluster_cols=self.index_columns
diff --git a/bigframes/core/pruning.py b/bigframes/core/pruning.py
@@ -0,0 +1,64 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Sequence
+
+import bigframes.core.expression as ex
+import bigframes.operations as ops
+
+COMPARISON_OP_TYPES = tuple(  # types of the comparison ops listed below
+    type(i)
+    for i in (
+        ops.eq_op,
+        ops.eq_null_match_op,
+        ops.ne_op,
+        ops.gt_op,
+        ops.ge_op,
+        ops.lt_op,
+        ops.le_op,
+    )
+)
+
+
+def cluster_cols_for_predicate(predicate: ex.Expression) -> Sequence[str]:
+    """Try to determine cluster col candidates that work with given predicates.
+
+    Returns column ids in first-seen order; an empty list means no candidate.
+    """
+    if isinstance(predicate, ex.UnboundVariableExpression):
+        return [predicate.id]
+    if not isinstance(predicate, ex.OpExpression):
+        return []  # Constant
+    op, operands = predicate.op, predicate.inputs
+    if isinstance(op, COMPARISON_OP_TYPES):
+        return cluster_cols_for_comparison(operands[0], operands[1])
+    if isinstance(op, type(ops.invert_op)):
+        return cluster_cols_for_predicate(operands[0])
+    if isinstance(op, (type(ops.and_op), type(ops.or_op))):
+        lhs_cols = cluster_cols_for_predicate(operands[0])
+        rhs_cols = cluster_cols_for_predicate(operands[1])
+        return [*lhs_cols, *(col for col in rhs_cols if col not in lhs_cols)]
+    return []
+
+
+def cluster_cols_for_comparison(
+    left_ex: ex.Expression, right_ex: ex.Expression
+) -> Sequence[str]:
+    """Return the id of a column compared against a constant, else an empty list."""
+    for const_side, other_side in ((left_ex, right_ex), (right_ex, left_ex)):
+        if const_side.is_const:
+            if isinstance(other_side, ex.UnboundVariableExpression):
+                return [other_side.id]
+            return []
+    return []
diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py
@@ -15,7 +15,7 @@
 
 import functools
 import itertools
-from typing import Callable, Dict, Optional
+from typing import Callable, Dict, Optional, Sequence
 
 import bigframes.core.nodes as nodes
 
@@ -91,6 +91,30 @@ def _node_counts_inner(
     )
 
 
+def count_nodes(forest: Sequence[nodes.BigFrameNode]) -> dict[nodes.BigFrameNode, int]:
+    """Count how many times each node occurs across all trees in ``forest``."""
+
+    def _merge(
+        first: Dict[nodes.BigFrameNode, int], second: Dict[nodes.BigFrameNode, int]
+    ) -> Dict[nodes.BigFrameNode, int]:
+        # Sum the counts key by key; a key missing from one side counts as zero.
+        all_keys = itertools.chain(first.keys(), second.keys())
+        return {node: first.get(node, 0) + second.get(node, 0) for node in all_keys}
+
+    no_counts: Dict[nodes.BigFrameNode, int] = {}
+
+    @functools.cache
+    def _subtree_counts(subtree: nodes.BigFrameNode) -> Dict[nodes.BigFrameNode, int]:
+        """Count occurrences of every node in ``subtree``, including its root."""
+        totals = no_counts
+        for child in subtree.child_nodes:
+            totals = _merge(totals, _subtree_counts(child))
+        return _merge(totals, {subtree: 1})
+
+    per_tree = [_subtree_counts(root) for root in forest]
+    return functools.reduce(_merge, per_tree, no_counts)
+
+
 def replace_nodes(
     root: nodes.BigFrameNode,
     replacements: dict[nodes.BigFrameNode, nodes.BigFrameNode],
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
@@ -60,6 +60,11 @@ def order_preserving(self) -> bool:
         """Whether the row operation preserves total ordering. Can be pruned from ordering expressions."""
         return False
 
+    @property
+    def pruning_compatible(self) -> bool:
+        """Whether the operation preserves locality, so it is safe for pruning."""
+        return False  # NOTE(review): docstring was truncated upstream; confirm meaning
+
 
 @dataclasses.dataclass(frozen=True)
 class NaryOp(ScalarOp):
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -617,6 +617,39 @@ def head(self, n: int = 5) -> Series:
     def tail(self, n: int = 5) -> Series:
         return typing.cast(Series, self.iloc[-n:])
 
+    def peek(self, n: int = 5, *, force: bool = True) -> pandas.Series:
+        """
+        Preview n arbitrary elements from the series. No guarantees about row selection or ordering.
+        ``Series.peek(force=False)`` will always be very fast, but will not succeed if data requires
+        full data scanning. Using ``force=True`` will always succeed, but may perform queries.
+        Query results will be cached so that future steps will benefit from these queries.
+
+        Args:
+            n (int, default 5):
+                The number of rows to select from the series. Which N rows are returned is non-deterministic.
+            force (bool, default True):
+                If the data cannot be peeked efficiently, the series will instead be fully materialized as part
+                of the operation if ``force=True``. If ``force=False``, the operation will throw a ValueError.
+        Returns:
+            pandas.Series: A pandas Series with n rows.
+
+        Raises:
+            ValueError: If force=False and data cannot be efficiently peeked.
+        """
+        maybe_result = self._block.try_peek(n)
+        if maybe_result is None:
+            if force:
+                self._cached()
+                maybe_result = self._block.try_peek(n, force=True)
+                assert maybe_result is not None
+            else:
+                raise ValueError(
+                    "Cannot peek efficiently when data has aggregates, joins or window functions applied. Use force=True to fully compute dataframe."
+                )
+        as_series = maybe_result.squeeze(axis=1)
+        as_series.name = self.name
+        return as_series
+
     def nlargest(self, n: int = 5, keep: str = "first") -> Series:
         if keep not in ("first", "last", "all"):
             raise ValueError("'keep must be one of 'first', 'last', or 'all'")
@@ -1400,7 +1433,7 @@ def apply(
 
         # return Series with materialized result so that any error in the remote
         # function is caught early
-        materialized_series = result_series._cached()
+        materialized_series = result_series._cached(session_aware=False)
         return materialized_series
 
     def combine(
@@ -1775,10 +1808,11 @@ def cache(self):
         Returns:
             Series: Self
         """
-        return self._cached(force=True)
+        # Do not use session-aware caching if user-requested
+        return self._cached(force=True, session_aware=False)
 
-    def _cached(self, *, force: bool = True) -> Series:
-        self._block.cached(force=force)
+    def _cached(self, *, force: bool = True, session_aware: bool = True) -> Series:
+        self._block.cached(force=force, session_aware=session_aware)
         return self
 
     def _optimize_query_complexity(self):
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
@@ -16,7 +16,6 @@
 
 from __future__ import annotations
 
-import collections.abc
 import copy
 import datetime
 import logging
@@ -81,10 +80,12 @@
 import bigframes.core as core
 import bigframes.core.blocks as blocks
 import bigframes.core.compile
+import bigframes.core.expression as ex
 import bigframes.core.guid
 import bigframes.core.nodes as nodes
 from bigframes.core.ordering import IntegerEncoding
 import bigframes.core.ordering as order
+import bigframes.core.pruning
 import bigframes.core.tree_properties as traversals
 import bigframes.core.tree_properties as tree_properties
 import bigframes.core.utils as utils
@@ -326,13 +327,15 @@ def session_id(self):
     @property
     def objects(
         self,
-    ) -> collections.abc.Set[
+    ) -> Tuple[
         Union[
             bigframes.core.indexes.Index, bigframes.series.Series, dataframe.DataFrame
         ]
     ]:
+        resolved = [(ref, ref()) for ref in self._objects]  # deref each weakref once
+        self._objects = [ref for ref, obj in resolved if obj is not None]
         # Create a set with strong references, be careful not to hold onto this needlessly, as will prevent garbage collection.
-        return set(i() for i in self._objects if i() is not None)  # type: ignore
+        return tuple(obj for _, obj in resolved if obj is not None)  # type: ignore
 
     @property
     def _project(self):
@@ -1913,6 +1916,51 @@ def _cache_with_offsets(self, array_value: core.ArrayValue):
         ).node
         self._cached_executions[array_value.node] = cached_replacement
 
+    def _session_aware_caching(self, array_value: core.ArrayValue) -> None:
+        """Cache the node at or below ``array_value`` that is most shared in the session."""
+        # this is the occurrence count across the whole session
+        node_counts = traversals.count_nodes(
+            [obj._block.expr.node for obj in self.objects]
+        )
+        de_cachable_types = (nodes.FilterNode, nodes.ProjectionNode)
+        caching_target = array_value.node
+        caching_target_count = node_counts.get(caching_target, 0)
+
+        cur_node = array_value.node
+
+        # TODO: Identify filtered columns from FilterNode and use as cluster col(s)
+        # Filters seen while traversing downwards, rebound through each projection passed.
+        filters: list[ex.Expression] = []
+        cluster_col: Optional[str] = None
+        while isinstance(cur_node, de_cachable_types):
+            if isinstance(cur_node, nodes.FilterNode):
+                filters.append(cur_node.predicate)
+            if isinstance(cur_node, nodes.ProjectionNode):
+                bindings = {name: expr for expr, name in cur_node.assignments}
+                filters = [i.bind_all_variables(bindings) for i in filters]
+
+            cur_node = cur_node.child
+            cur_node_count = node_counts.get(cur_node, 0)
+            if cur_node_count > caching_target_count:
+                caching_target, caching_target_count = cur_node, cur_node_count
+                cluster_col = None
+                # Just pick the first cluster-compatible predicate, hence the break below
+                for predicate in filters:
+                    # Cluster cols only consider the target object and not other session objects
+                    cluster_cols = bigframes.core.pruning.cluster_cols_for_predicate(
+                        predicate
+                    )
+                    if len(cluster_cols) > 0:
+                        cluster_col = cluster_cols[0]
+                        break
+
+        if cluster_col:
+            self._cache_with_cluster_cols(
+                core.ArrayValue(caching_target), [cluster_col]
+            )
+        else:
+            self._cache_with_offsets(core.ArrayValue(caching_target))
+
     def _simplify_with_caching(self, array_value: core.ArrayValue):
         """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces."""
         # Apply existing caching first
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
@@ -1936,6 +1936,41 @@ def test_head_then_series_operation(scalars_dfs):
     )
 
 
+def test_series_peek(scalars_dfs):
+    """Peeking an unmodified column with ``force=False`` matches the pandas data."""
+    bf_df, pd_df = scalars_dfs
+    peeked = bf_df["float64_col"].peek(n=3, force=False)
+    # Row choice is arbitrary, so align the pandas data to whatever rows came back.
+    expected = pd_df["float64_col"].reindex_like(peeked)
+    pd.testing.assert_series_equal(peeked, expected)
+
+
+def test_series_peek_filtered(scalars_dfs):
+    """Peeking a row-filtered column with ``force=False`` matches the pandas data."""
+    bf_df, pd_df = scalars_dfs
+    bf_filtered = bf_df[bf_df.int64_col > 0]["float64_col"]
+    peeked = bf_filtered.peek(n=3, force=False)
+
+    pd_filtered = pd_df[pd_df.int64_col > 0]["float64_col"]
+    # Row choice is arbitrary, so compare only the rows that peek returned.
+    expected = pd_filtered.reindex_like(peeked)
+    pd.testing.assert_series_equal(peeked, expected)
+
+
+def test_series_peek_force(scalars_dfs):
+    """Peeking with ``force=True`` works on a filtered cumulative sum."""
+    bf_df, pd_df = scalars_dfs
+    columns = ["int64_col", "int64_too"]
+
+    bf_cumsum = bf_df[columns].cumsum()
+    peeked = bf_cumsum[bf_cumsum.int64_col > 0]["int64_too"].peek(n=3, force=True)
+
+    pd_cumsum = pd_df[columns].cumsum()
+    pd_filtered = pd_cumsum[pd_cumsum.int64_col > 0]["int64_too"]
+    # Row choice is arbitrary, so compare only the rows that peek returned.
+    pd.testing.assert_series_equal(peeked, pd_filtered.reindex_like(peeked))
+
+
 def test_shift(scalars_df_index, scalars_pandas_df_index):
     col_name = "int64_col"
     bf_result = scalars_df_index[col_name].shift().to_pandas()