# Source code for xorbits._mars.dataframe.missing.checkna

# Copyright 2022-2023 XProbe Inc.
# derived from copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

import numpy as np
import pandas as pd

from ... import dataframe as md
from ... import opcodes
from ... import tensor as mt
from ...core import OutputType
from ...serialization.serializables import BoolField
from ..operands import (
    DATAFRAME_TYPE,
    ENTITY_TYPE,
    INDEX_TYPE,
    SERIES_TYPE,
    TENSOR_TYPE,
    DataFrameOperand,
    DataFrameOperandMixin,
)


class DataFrameCheckNA(DataFrameOperand, DataFrameOperandMixin):
    """Operand that builds an element-wise boolean mask of missing values.

    With ``positive`` set to a true value the mask is produced by ``isna()``;
    otherwise it is produced by ``notna()`` (see :meth:`execute`).
    """

    _op_type_ = opcodes.CHECK_NA

    # Chooses between ``isna`` (truthy) and ``notna`` (falsy) at execution.
    _positive = BoolField("positive")

    def __init__(self, positive=None, sparse=None, output_types=None, **kw):
        super().__init__(
            _positive=positive, _output_types=output_types, sparse=sparse, **kw
        )

    @property
    def positive(self) -> bool:
        """Whether the operand checks for NA (``isna``) rather than ``notna``."""
        return self._positive

    @staticmethod
    def _boolean_params(op, entity):
        """Copy ``entity.params`` and switch the dtype metadata to ``bool``.

        Dataframe outputs get a ``dtypes`` series with one ``bool`` entry per
        column of ``entity``; every other output kind gets a scalar ``dtype``.
        """
        meta = entity.params.copy()
        if op.output_types[0] == OutputType.dataframe:
            bool_columns = [np.dtype("bool")] * len(entity.dtypes)
            meta["dtypes"] = pd.Series(
                bool_columns, index=entity.columns_value.to_pandas()
            )
        else:
            meta["dtype"] = np.dtype("bool")
        return meta

    def __call__(self, df):
        """Create the output tileable for ``df``.

        The output type is a dataframe for dataframe input, a series for
        series input, and a tensor for tensor or index input.

        Raises
        ------
        TypeError
            If ``df`` is none of the supported entity kinds.
        """
        if isinstance(df, DATAFRAME_TYPE):
            kind = OutputType.dataframe
        elif isinstance(df, SERIES_TYPE):
            kind = OutputType.series
        elif isinstance(df, (TENSOR_TYPE, INDEX_TYPE)):
            kind = OutputType.tensor
        else:
            raise TypeError(
                f"Expecting mars dataframe, series, index, or tensor, got {type(df)}"
            )
        self.output_types = [kind]

        return self.new_tileable([df], **self._boolean_params(self, df))

    @classmethod
    def tile(cls, op: "DataFrameCheckNA"):
        """Emit one output chunk per input chunk, reusing the input's nsplits."""
        source = op.inputs[0]
        result = op.outputs[0]

        out_chunks = []
        for in_chunk in source.chunks:
            chunk_meta = cls._boolean_params(op, in_chunk)
            # Every chunk gets its own operand copy with a fresh key.
            chunk_op = op.copy().reset_key()
            out_chunks.append(chunk_op.new_chunk([in_chunk], **chunk_meta))

        tiled_op = op.copy().reset_key()
        tileable_meta = result.params.copy()
        tileable_meta["chunks"] = out_chunks
        tileable_meta["nsplits"] = source.nsplits
        return tiled_op.new_tileables([source], **tileable_meta)

    @classmethod
    def execute(cls, ctx, op: "DataFrameCheckNA"):
        """Compute the mask for one chunk and store it in ``ctx``."""
        data = ctx[op.inputs[0].key]
        ctx[op.outputs[0].key] = data.isna() if op.positive else data.notna()


def _from_pandas(obj: Any):
    """Wrap a pandas/numpy result into its mars counterpart.

    ``pd.DataFrame`` and ``pd.Series`` go through the matching datasource
    ``from_pandas`` function, ``np.ndarray`` goes through ``mt.tensor``, and
    any other object is returned unchanged.
    """
    if isinstance(obj, pd.DataFrame):
        # Imported locally, as in the rest of this helper.
        from ..datasource.dataframe import from_pandas as dataframe_from_pandas

        return dataframe_from_pandas(obj)

    if isinstance(obj, pd.Series):
        from ..datasource.series import from_pandas as series_from_pandas

        return series_from_pandas(obj)

    if isinstance(obj, np.ndarray):
        return mt.tensor(obj)

    return obj


def isna(obj):
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    NA values, such as None or :attr:`numpy.NaN`, get mapped to True
    values. Everything else gets mapped to False values. Characters such as
    empty strings ``''`` or :attr:`numpy.inf` are not considered NA values
    (unless you set ``pandas.options.mode.use_inf_as_na = True``).

    Parameters
    ----------
    obj : object
        A mars dataframe, series, index or tensor, or any object accepted
        by :func:`pandas.isna`.

    Returns
    -------
    DataFrame
        Mask of bool values for each element in DataFrame that
        indicates whether an element is an NA value.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a ``MultiIndex``.

    See Also
    --------
    DataFrame.isnull : Alias of isna.
    DataFrame.notna : Boolean inverse of isna.
    DataFrame.dropna : Omit axes labels with missing values.
    isna : Top-level isna.

    Examples
    --------
    Show which entries in a DataFrame are NA.

    >>> import numpy as np
    >>> import mars.dataframe as md
    >>> df = md.DataFrame({'age': [5, 6, np.NaN],
    ...                    'born': [md.NaT, md.Timestamp('1939-05-27'),
    ...                             md.Timestamp('1940-04-25')],
    ...                    'name': ['Alfred', 'Batman', ''],
    ...                    'toy': [None, 'Batmobile', 'Joker']})
    >>> df.execute()
       age       born    name        toy
    0  5.0        NaT  Alfred       None
    1  6.0 1939-05-27  Batman  Batmobile
    2  NaN 1940-04-25              Joker

    >>> df.isna().execute()
         age   born   name    toy
    0  False   True  False   True
    1  False  False  False  False
    2   True  False  False  False

    Show which entries in a Series are NA.

    >>> ser = md.Series([5, 6, np.NaN])
    >>> ser.execute()
    0    5.0
    1    6.0
    2    NaN
    dtype: float64

    >>> ser.isna().execute()
    0    False
    1    False
    2     True
    dtype: bool
    """
    if isinstance(obj, md.MultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")
    elif isinstance(obj, ENTITY_TYPE):
        if isinstance(obj, TENSOR_TYPE):
            # Tensors are handled by the tensor module, not the NA operand.
            return mt.isnan(obj)
        else:
            op = DataFrameCheckNA(positive=True)
            return op(obj)
    else:
        # Non-mars input: let pandas compute the mask, then wrap the result.
        return _from_pandas(pd.isna(obj))
def notna(obj):
    """
    Detect existing (non-missing) values.

    Return a boolean same-sized object indicating if the values are not NA.
    Non-missing values get mapped to True. Characters such as empty
    strings ``''`` or :attr:`numpy.inf` are not considered NA values
    (unless you set ``pandas.options.mode.use_inf_as_na = True``).
    NA values, such as None or :attr:`numpy.NaN`, get mapped to False
    values.

    Parameters
    ----------
    obj : object
        A mars dataframe, series, index or tensor, or any object accepted
        by :func:`pandas.notna`.

    Returns
    -------
    DataFrame
        Mask of bool values for each element in DataFrame that
        indicates whether an element is not an NA value.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a ``MultiIndex``.

    See Also
    --------
    DataFrame.notnull : Alias of notna.
    DataFrame.isna : Boolean inverse of notna.
    DataFrame.dropna : Omit axes labels with missing values.
    notna : Top-level notna.

    Examples
    --------
    Show which entries in a DataFrame are not NA.

    >>> import numpy as np
    >>> import mars.dataframe as md
    >>> df = md.DataFrame({'age': [5, 6, np.NaN],
    ...                    'born': [md.NaT, md.Timestamp('1939-05-27'),
    ...                             md.Timestamp('1940-04-25')],
    ...                    'name': ['Alfred', 'Batman', ''],
    ...                    'toy': [None, 'Batmobile', 'Joker']})
    >>> df.execute()
       age       born    name        toy
    0  5.0        NaT  Alfred       None
    1  6.0 1939-05-27  Batman  Batmobile
    2  NaN 1940-04-25              Joker

    >>> df.notna().execute()
         age   born  name    toy
    0   True  False  True  False
    1   True   True  True   True
    2  False   True  True   True

    Show which entries in a Series are not NA.

    >>> ser = md.Series([5, 6, np.NaN])
    >>> ser.execute()
    0    5.0
    1    6.0
    2    NaN
    dtype: float64

    >>> ser.notna().execute()
    0     True
    1     True
    2    False
    dtype: bool
    """
    if isinstance(obj, md.MultiIndex):
        # Name the function the caller actually invoked.
        raise NotImplementedError("notna is not defined for MultiIndex")
    elif isinstance(obj, ENTITY_TYPE):
        if isinstance(obj, TENSOR_TYPE):
            # Tensors are handled by the tensor module: invert the NaN mask.
            return ~mt.isnan(obj)
        else:
            op = DataFrameCheckNA(positive=False)
            return op(obj)
    else:
        # Non-mars input: let pandas compute the mask, then wrap the result.
        return _from_pandas(pd.notna(obj))
# Alternate names bound to the same top-level functions.
isnull = isna
notnull = notna