|
16 | 16 |
|
17 | 17 | from __future__ import annotations
|
18 | 18 |
|
19 |
| -import collections.abc |
20 | 19 | import copy
|
21 | 20 | import datetime
|
22 | 21 | import logging
|
|
81 | 80 | import bigframes.core as core
|
82 | 81 | import bigframes.core.blocks as blocks
|
83 | 82 | import bigframes.core.compile
|
| 83 | +import bigframes.core.expression as ex |
84 | 84 | import bigframes.core.guid
|
85 | 85 | import bigframes.core.nodes as nodes
|
86 | 86 | from bigframes.core.ordering import IntegerEncoding
|
87 | 87 | import bigframes.core.ordering as order
|
| 88 | +import bigframes.core.pruning |
88 | 89 | import bigframes.core.tree_properties as traversals
|
89 | 90 | import bigframes.core.tree_properties as tree_properties
|
90 | 91 | import bigframes.core.utils as utils
|
@@ -326,13 +327,15 @@ def session_id(self):
|
@property
def objects(
    self,
) -> Tuple[
    Union[
        bigframes.core.indexes.Index, bigframes.series.Series, dataframe.DataFrame
    ],
    ...,
]:
    """Return the session-tracked objects that are still alive.

    Also prunes dead weakrefs from ``self._objects`` so the tracking list
    does not grow without bound over the session's lifetime.

    Returns:
        A tuple of strong references to the live Index/Series/DataFrame
        objects. Be careful not to hold onto this needlessly, as it will
        prevent garbage collection of session objects.
    """
    # Dereference each weakref exactly once so an object cannot be
    # collected between the liveness check and result construction.
    dereferenced = [(ref, ref()) for ref in self._objects]
    # Drop weakrefs whose targets have already been collected.
    self._objects = [ref for ref, obj in dereferenced if obj is not None]
    return tuple(obj for _, obj in dereferenced if obj is not None)  # type: ignore
336 | 339 |
|
337 | 340 | @property
|
338 | 341 | def _project(self):
|
@@ -1913,6 +1916,51 @@ def _cache_with_offsets(self, array_value: core.ArrayValue):
|
1913 | 1916 | ).node
|
1914 | 1917 | self._cached_executions[array_value.node] = cached_replacement
|
1915 | 1918 |
|
def _session_aware_caching(self, array_value: core.ArrayValue) -> None:
    """Cache the subtree of ``array_value`` most shared across the session.

    Walks downward through de-cachable nodes (filters and projections),
    choosing as the caching target the node with the highest occurrence
    count across all live session objects' expression trees. If a filter
    predicate seen above the chosen target yields a cluster-compatible
    column, the target is cached clustered on that column; otherwise it
    is cached with sequential offsets.

    Args:
        array_value: The expression whose subtree should be cached.
    """
    # Occurrence count of each node across the whole session.
    node_counts = traversals.count_nodes(
        [obj._block.expr.node for obj in self.objects]
    )
    de_cachable_types = (nodes.FilterNode, nodes.ProjectionNode)
    caching_target = array_value.node
    caching_target_count = node_counts.get(caching_target, 0)

    cur_node = array_value.node

    # TODO: Identify filtered columns from FilterNode and use as cluster col(s)
    filters: list[
        ex.Expression
    ] = []  # accumulate filters into this as we traverse downwards
    cluster_col: Optional[str] = None
    while isinstance(cur_node, de_cachable_types):
        if isinstance(cur_node, nodes.FilterNode):
            filters.append(cur_node.predicate)
        if isinstance(cur_node, nodes.ProjectionNode):
            # Rewrite accumulated predicates in terms of the projection's inputs
            # so they remain valid against the deeper nodes we descend into.
            bindings = {name: expr for expr, name in cur_node.assignments}
            filters = [i.bind_all_variables(bindings) for i in filters]

        cur_node = cur_node.child
        cur_node_count = node_counts.get(cur_node, 0)
        if cur_node_count > caching_target_count:
            caching_target, caching_target_count = cur_node, cur_node_count
            cluster_col = None
            # Just pick the first cluster-compatible predicate
            for predicate in filters:
                # Cluster cols only consider the target object and not other session objects
                cluster_cols = bigframes.core.pruning.cluster_cols_for_predicate(
                    predicate
                )
                if len(cluster_cols) > 0:
                    cluster_col = cluster_cols[0]
                    # BUGFIX: was `continue`, which kept scanning and made the
                    # LAST matching predicate win, contradicting the comment above.
                    break

    if cluster_col:
        self._cache_with_cluster_cols(
            core.ArrayValue(caching_target), [cluster_col]
        )
    else:
        self._cache_with_offsets(core.ArrayValue(caching_target))
1916 | 1964 | def _simplify_with_caching(self, array_value: core.ArrayValue):
|
1917 | 1965 | """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces."""
|
1918 | 1966 | # Apply existing caching first
|
|
0 commit comments