# Copyright 2022-2023 XProbe Inc.
# derived from copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any
import numpy as np
import pandas as pd
from ... import dataframe as md
from ... import opcodes
from ... import tensor as mt
from ...core import OutputType
from ...serialization.serializables import BoolField
from ..operands import (
DATAFRAME_TYPE,
ENTITY_TYPE,
INDEX_TYPE,
SERIES_TYPE,
TENSOR_TYPE,
DataFrameOperand,
DataFrameOperandMixin,
)
class DataFrameCheckNA(DataFrameOperand, DataFrameOperandMixin):
    """
    Operand that builds a boolean mask of missing (or present) values.

    When executed, the input chunk's ``isna()`` is called if ``positive`` is
    truthy and its ``notna()`` otherwise. Outputs reuse the parameters of the
    corresponding input, with the dtype information replaced by ``bool``.
    """

    _op_type_ = opcodes.CHECK_NA

    # Chooses the check run by ``execute``: ``isna()`` if truthy, else ``notna()``.
    _positive = BoolField("positive")

    def __init__(self, positive=None, sparse=None, output_types=None, **kw):
        super().__init__(
            _positive=positive, _output_types=output_types, sparse=sparse, **kw
        )

    @property
    def positive(self) -> bool:
        """Whether this operand checks for NA (truthy) or for non-NA values."""
        return self._positive

    def __call__(self, df):
        """
        Create the output tileable for ``df``.

        Parameters
        ----------
        df
            A dataframe, series, index or tensor entity. Index inputs are
            given the tensor output type, like tensor inputs.

        Returns
        -------
        The new tileable, carrying a copy of ``df.params`` whose ``dtypes``
        (dataframe output) or ``dtype`` (any other output) is ``bool``.

        Raises
        ------
        TypeError
            If ``df`` is none of the supported entity types.
        """
        # Checked in order; the first matching entry decides the output type.
        for entity_types, out_type in (
            (DATAFRAME_TYPE, OutputType.dataframe),
            (SERIES_TYPE, OutputType.series),
            (TENSOR_TYPE, OutputType.tensor),
            (INDEX_TYPE, OutputType.tensor),
        ):
            if isinstance(df, entity_types):
                self.output_types = [out_type]
                break
        else:
            raise TypeError(
                f"Expecting mars dataframe, series, index, or tensor, got {type(df)}"
            )

        out_params = df.params.copy()
        bool_dtype = np.dtype("bool")
        if self.output_types[0] == OutputType.dataframe:
            # One bool dtype per input column, labelled with the input columns.
            bool_dtypes = [bool_dtype] * len(df.dtypes)
            out_params["dtypes"] = pd.Series(
                bool_dtypes, index=df.columns_value.to_pandas()
            )
        else:
            out_params["dtype"] = bool_dtype
        return self.new_tileable([df], **out_params)

    @classmethod
    def tile(cls, op: "DataFrameCheckNA"):
        """
        Tile the operand chunk by chunk.

        One output chunk is created for every chunk of the input, each with
        bool dtype information; the result keeps the input's ``nsplits``.
        """
        source = op.inputs[0]
        result = op.outputs[0]

        out_chunks = []
        for in_chunk in source.chunks:
            chunk_params = in_chunk.params.copy()
            bool_dtype = np.dtype("bool")
            if op.output_types[0] == OutputType.dataframe:
                bool_dtypes = [bool_dtype] * len(in_chunk.dtypes)
                chunk_params["dtypes"] = pd.Series(
                    bool_dtypes, index=in_chunk.columns_value.to_pandas()
                )
            else:
                chunk_params["dtype"] = bool_dtype
            # Every chunk gets its own copy of the operand with a fresh key.
            chunk_op = op.copy().reset_key()
            out_chunks.append(chunk_op.new_chunk([in_chunk], **chunk_params))

        tiled_op = op.copy().reset_key()
        tiled_params = result.params.copy()
        tiled_params["chunks"] = out_chunks
        tiled_params["nsplits"] = source.nsplits
        return tiled_op.new_tileables([source], **tiled_params)

    @classmethod
    def execute(cls, ctx, op: "DataFrameCheckNA"):
        """Store ``isna()`` or ``notna()`` of the input data in ``ctx``."""
        data = ctx[op.inputs[0].key]
        check = data.isna if op.positive else data.notna
        ctx[op.outputs[0].key] = check()
def _from_pandas(obj: Any):
if isinstance(obj, pd.DataFrame):
from ..datasource.dataframe import from_pandas
return from_pandas(obj)
elif isinstance(obj, pd.Series):
from ..datasource.series import from_pandas
return from_pandas(obj)
elif isinstance(obj, np.ndarray):
return mt.tensor(obj)
else:
return obj
def isna(obj):
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    NA values, such as None or :attr:`numpy.nan`, get mapped to True
    values.
    Everything else gets mapped to False values. Characters such as empty
    strings ``''`` or :attr:`numpy.inf` are not considered NA values
    (unless you set ``pandas.options.mode.use_inf_as_na = True``).

    Parameters
    ----------
    obj
        A Mars DataFrame, Series, Index or tensor, or any other object
        accepted by :func:`pandas.isna`.

    Returns
    -------
    DataFrame, Series, tensor or scalar
        Mask of bool values for each element of ``obj`` that
        indicates whether an element is an NA value.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a ``md.MultiIndex``.

    See Also
    --------
    DataFrame.isnull : Alias of isna.
    DataFrame.notna : Boolean inverse of isna.
    DataFrame.dropna : Omit axes labels with missing values.
    isna : Top-level isna.

    Examples
    --------
    Show which entries in a DataFrame are NA.

    >>> import numpy as np
    >>> import mars.dataframe as md
    >>> df = md.DataFrame({'age': [5, 6, np.nan],
    ...                    'born': [md.NaT, md.Timestamp('1939-05-27'),
    ...                             md.Timestamp('1940-04-25')],
    ...                    'name': ['Alfred', 'Batman', ''],
    ...                    'toy': [None, 'Batmobile', 'Joker']})
    >>> df.execute()
       age       born    name        toy
    0  5.0        NaT  Alfred       None
    1  6.0 1939-05-27  Batman  Batmobile
    2  NaN 1940-04-25              Joker

    >>> df.isna().execute()
         age   born   name    toy
    0  False   True  False   True
    1  False  False  False  False
    2   True  False  False  False

    Show which entries in a Series are NA.

    >>> ser = md.Series([5, 6, np.nan])
    >>> ser.execute()
    0    5.0
    1    6.0
    2    NaN
    dtype: float64

    >>> ser.isna().execute()
    0    False
    1    False
    2     True
    dtype: bool
    """
    if isinstance(obj, md.MultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    if not isinstance(obj, ENTITY_TYPE):
        # Not a lazy entity: let pandas compute the mask eagerly, then wrap
        # a DataFrame/Series/ndarray result via ``_from_pandas``.
        return _from_pandas(pd.isna(obj))

    if isinstance(obj, TENSOR_TYPE):
        # Tensors are handled by the tensor module's own NaN check.
        return mt.isnan(obj)

    op = DataFrameCheckNA(positive=True)
    return op(obj)
def notna(obj):
    """
    Detect existing (non-missing) values.

    Return a boolean same-sized object indicating if the values are not NA.
    Non-missing values get mapped to True. Characters such as empty
    strings ``''`` or :attr:`numpy.inf` are not considered NA values
    (unless you set ``pandas.options.mode.use_inf_as_na = True``).
    NA values, such as None or :attr:`numpy.nan`, get mapped to False
    values.

    Parameters
    ----------
    obj
        A Mars DataFrame, Series, Index or tensor, or any other object
        accepted by :func:`pandas.notna`.

    Returns
    -------
    DataFrame, Series, tensor or scalar
        Mask of bool values for each element of ``obj`` that
        indicates whether an element is not an NA value.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a ``md.MultiIndex``.

    See Also
    --------
    DataFrame.notnull : Alias of notna.
    DataFrame.isna : Boolean inverse of notna.
    DataFrame.dropna : Omit axes labels with missing values.
    notna : Top-level notna.

    Examples
    --------
    Show which entries in a DataFrame are not NA.

    >>> import numpy as np
    >>> import mars.dataframe as md
    >>> df = md.DataFrame({'age': [5, 6, np.nan],
    ...                    'born': [md.NaT, md.Timestamp('1939-05-27'),
    ...                             md.Timestamp('1940-04-25')],
    ...                    'name': ['Alfred', 'Batman', ''],
    ...                    'toy': [None, 'Batmobile', 'Joker']})
    >>> df.execute()
       age       born    name        toy
    0  5.0        NaT  Alfred       None
    1  6.0 1939-05-27  Batman  Batmobile
    2  NaN 1940-04-25              Joker

    >>> df.notna().execute()
         age   born  name    toy
    0   True  False  True  False
    1   True   True  True   True
    2  False   True  True   True

    Show which entries in a Series are not NA.

    >>> ser = md.Series([5, 6, np.nan])
    >>> ser.execute()
    0    5.0
    1    6.0
    2    NaN
    dtype: float64

    >>> ser.notna().execute()
    0     True
    1     True
    2    False
    dtype: bool
    """
    if isinstance(obj, md.MultiIndex):
        # Name the function that was actually called in the error message.
        raise NotImplementedError("notna is not defined for MultiIndex")

    if not isinstance(obj, ENTITY_TYPE):
        # Not a lazy entity: let pandas compute the mask eagerly, then wrap
        # a DataFrame/Series/ndarray result via ``_from_pandas``.
        return _from_pandas(pd.notna(obj))

    if isinstance(obj, TENSOR_TYPE):
        # Tensors are handled by inverting the tensor module's NaN check.
        return ~mt.isnan(obj)

    op = DataFrameCheckNA(positive=False)
    return op(obj)
# Alternative names bound to the same function objects:
# ``isnull`` is ``isna`` and ``notnull`` is ``notna``.
isnull = isna
notnull = notna