Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REFACTOR-#7294: Add similar methods as in 7294 for operating on columns #7314

Merged
merged 1 commit into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 7 additions & 10 deletions modin/core/dataframe/algebra/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,13 +205,10 @@ def maybe_build_dtypes_series(
Finds a union of columns and finds dtypes for all these columns.
"""
if not trigger_computations:
if not first._modin_frame.has_columns_cache:
if not first.frame_has_columns_cache:
return None

if (
isinstance(second, type(first))
and not second._modin_frame.has_columns_cache
):
if isinstance(second, type(first)) and not second.frame_has_columns_cache:
return None

columns_first = set(first.columns)
Expand Down Expand Up @@ -384,8 +381,8 @@ def caller(
if isinstance(other, type(query_compiler)):
if broadcast:
if (
query_compiler._modin_frame.has_materialized_columns
and other._modin_frame.has_materialized_columns
query_compiler.frame_has_materialized_columns
and other.frame_has_materialized_columns
):
if (
len(query_compiler.columns) == 1
Expand All @@ -408,8 +405,8 @@ def caller(
)
else:
if (
query_compiler._modin_frame.has_materialized_columns
and other._modin_frame.has_materialized_columns
query_compiler.frame_has_materialized_columns
and other.frame_has_materialized_columns
):
if (
len(query_compiler.columns) == 1
Expand Down Expand Up @@ -440,7 +437,7 @@ def caller(
)
else:
if (
query_compiler._modin_frame.has_materialized_columns
query_compiler.frame_has_materialized_columns
and len(query_compiler._modin_frame.columns) == 1
and is_scalar(other)
):
Expand Down
43 changes: 43 additions & 0 deletions modin/core/storage_formats/base/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4532,6 +4532,28 @@ def frame_has_materialized_dtypes(self) -> bool:
"""
return self._modin_frame.has_materialized_dtypes

@property
def frame_has_materialized_columns(self) -> bool:
"""
Check if the underlying dataframe has materialized columns.

This is a read-only accessor that forwards to the underlying frame,
so callers do not need to reach into ``_modin_frame`` themselves.

Returns
-------
bool
    The value of ``has_materialized_columns`` reported by ``self._modin_frame``.
"""
return self._modin_frame.has_materialized_columns

@property
def frame_has_materialized_index(self) -> bool:
"""
Check if the underlying dataframe has materialized index.

This is a read-only accessor that forwards to the underlying frame,
so callers do not need to reach into ``_modin_frame`` themselves.

Returns
-------
bool
    The value of ``has_materialized_index`` reported by ``self._modin_frame``.
"""
return self._modin_frame.has_materialized_index

def set_frame_dtypes_cache(self, dtypes):
"""
Set dtypes cache for the underlying dataframe frame.
Expand All @@ -4552,6 +4574,16 @@ def set_frame_index_cache(self, index):
"""
self._modin_frame.set_index_cache(index)

def set_frame_columns_cache(self, index) -> None:
"""
Set columns cache for underlying dataframe.

Parameters
----------
index : sequence, callable or None
    New value for the columns cache. Despite the parameter name, it
    describes the column labels; it is forwarded unchanged to
    ``self._modin_frame.set_columns_cache``.
"""
self._modin_frame.set_columns_cache(index)

@property
def frame_has_index_cache(self):
"""
Expand All @@ -4563,6 +4595,17 @@ def frame_has_index_cache(self):
"""
return self._modin_frame.has_index_cache

@property
def frame_has_columns_cache(self) -> bool:
"""
Check if the columns cache exists for underlying dataframe.

This is a read-only accessor that forwards to the underlying frame,
so callers do not need to reach into ``_modin_frame`` themselves.

Returns
-------
bool
    The value of ``has_columns_cache`` reported by ``self._modin_frame``.
"""
return self._modin_frame.has_columns_cache

@property
def frame_has_dtypes_cache(self) -> bool:
"""
Expand Down
2 changes: 1 addition & 1 deletion modin/core/storage_formats/pandas/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def corr_method(
method=method, min_periods=min_periods, numeric_only=numeric_only
)

if not numeric_only and qc._modin_frame.has_materialized_columns:
if not numeric_only and qc.frame_has_materialized_columns:
new_index, new_columns = (
qc._modin_frame.copy_columns_cache(),
qc._modin_frame.copy_columns_cache(),
Expand Down
4 changes: 2 additions & 2 deletions modin/core/storage_formats/pandas/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def map_func(
# it's fine too, we can also decide that by columns, which tend to be already
# materialized quite often compared to the indexes.
keep_index = False
if left._modin_frame.has_materialized_index:
if left.frame_has_materialized_index:
keep_index = should_keep_index(left, right)
else:
# Have to trigger columns materialization. Hope they're already available at this point.
Expand Down Expand Up @@ -286,7 +286,7 @@ def _compute_result_metadata(
new_columns = None
new_dtypes = None

if not left._modin_frame.has_materialized_columns:
if not left.frame_has_materialized_columns:
return new_columns, new_dtypes

if left_on is None and right_on is None:
Expand Down
4 changes: 2 additions & 2 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ def _reset(df, *axis_lengths, partition_idx): # pragma: no cover
new_columns = None
if kwargs["drop"]:
dtypes = self._modin_frame.copy_dtypes_cache()
if self._modin_frame.has_columns_cache:
if self.frame_has_columns_cache:
new_columns = self._modin_frame.copy_columns_cache(
copy_lengths=True
)
Expand All @@ -642,7 +642,7 @@ def _reset(df, *axis_lengths, partition_idx): # pragma: no cover
dtypes = None
if (
# can precompute new columns if we know columns and index names
self._modin_frame.has_materialized_columns
self.frame_has_materialized_columns
and index_dtypes is not None
):
empty_index = (
Expand Down
16 changes: 8 additions & 8 deletions modin/tests/core/storage_formats/pandas/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1171,13 +1171,13 @@ def test_concat_dont_materialize_opposite_axis(axis):

def assert_no_cache(df, axis):
if axis:
assert not df._query_compiler._modin_frame.has_materialized_columns
assert not df._query_compiler.frame_has_materialized_columns
else:
assert not df._query_compiler._modin_frame.has_materialized_index
assert not df._query_compiler.frame_has_materialized_index

def remove_cache(df, axis):
if axis:
df._query_compiler._modin_frame.set_columns_cache(None)
df._query_compiler.set_frame_columns_cache(None)
else:
df._query_compiler.set_frame_index_cache(None)
assert_no_cache(df, axis)
Expand Down Expand Up @@ -2038,7 +2038,7 @@ def test_concat_axis_1(
or remaining_dtype is not None
)
# setting columns cache to 'None', in order to prevent completing 'dtypes' with the materialized columns
md_df._query_compiler._modin_frame.set_columns_cache(None)
md_df._query_compiler.set_frame_columns_cache(None)
md_df._query_compiler.set_frame_dtypes_cache(
ModinDtypes(
DtypesDescriptor(
Expand Down Expand Up @@ -2401,10 +2401,10 @@ def test_preserve_dtypes_reset_index(self, drop, has_materialized_index):
# case 1: 'df' has complete dtype by default
df = pd.DataFrame({"a": [1, 2, 3]})
if has_materialized_index:
assert df._query_compiler._modin_frame.has_materialized_index
assert df._query_compiler.frame_has_materialized_index
else:
df._query_compiler.set_frame_index_cache(None)
assert not df._query_compiler._modin_frame.has_materialized_index
assert not df._query_compiler.frame_has_materialized_index
assert df._query_compiler.frame_has_materialized_dtypes

res = df.reset_index(drop=drop)
Expand Down Expand Up @@ -2444,10 +2444,10 @@ def test_preserve_dtypes_reset_index(self, drop, has_materialized_index):
)
)
if has_materialized_index:
assert df._query_compiler._modin_frame.has_materialized_index
assert df._query_compiler.frame_has_materialized_index
else:
df._query_compiler.set_frame_index_cache(None)
assert not df._query_compiler._modin_frame.has_materialized_index
assert not df._query_compiler.frame_has_materialized_index

res = df.reset_index(drop=drop)
if drop:
Expand Down
Loading