# Source code for xorbits._mars.dataframe.missing.checkna

# Copyright 2022-2023 XProbe Inc.
# derived from copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

import numpy as np
import pandas as pd

from ... import dataframe as md
from ... import opcodes
from ... import tensor as mt
from ...core import OutputType
from ...serialization.serializables import BoolField
from ..operands import (
    DATAFRAME_TYPE,
    ENTITY_TYPE,
    INDEX_TYPE,
    SERIES_TYPE,
    TENSOR_TYPE,
    DataFrameOperand,
    DataFrameOperandMixin,
)


class DataFrameCheckNA(DataFrameOperand, DataFrameOperandMixin):
    """Operand that builds a boolean NA mask for its single input.

    At execution time the chunk data's ``isna()`` is called when
    ``positive`` is truthy, and ``notna()`` otherwise.
    """

    _op_type_ = opcodes.CHECK_NA

    # Serialized flag: truthy selects ``isna``, falsy selects ``notna``.
    _positive = BoolField("positive")

    def __init__(self, positive=None, sparse=None, output_types=None, **kw):
        """Forward the flag and the common operand options to the base class."""
        super().__init__(
            _positive=positive,
            _output_types=output_types,
            sparse=sparse,
            **kw,
        )

    @property
    def positive(self) -> bool:
        """Value of the ``positive`` field passed at construction time."""
        return self._positive

    @staticmethod
    def _boolean_params(entity, output_type):
        """Copy ``entity.params`` and switch its dtype metadata to bool.

        Dataframe outputs receive a ``dtypes`` series with one bool entry per
        column of ``entity``; every other output type receives a scalar
        ``dtype``. All remaining parameters are carried over unchanged.
        """
        params = entity.params.copy()
        if output_type == OutputType.dataframe:
            columns = entity.columns_value.to_pandas()
            params["dtypes"] = pd.Series(
                [np.dtype("bool")] * len(entity.dtypes), index=columns
            )
        else:
            params["dtype"] = np.dtype("bool")
        return params

    def __call__(self, df):
        """Create the output tileable for ``df``.

        Dataframe and series inputs keep their kind; tensor and index inputs
        both produce a tensor output.

        Raises
        ------
        TypeError
            If ``df`` is none of the supported entity types.
        """
        if isinstance(df, DATAFRAME_TYPE):
            kind = OutputType.dataframe
        elif isinstance(df, SERIES_TYPE):
            kind = OutputType.series
        elif isinstance(df, (TENSOR_TYPE, INDEX_TYPE)):
            kind = OutputType.tensor
        else:
            raise TypeError(
                f"Expecting mars dataframe, series, index, or tensor, got {type(df)}"
            )
        self.output_types = [kind]

        out_params = self._boolean_params(df, self.output_types[0])
        return self.new_tileable([df], **out_params)

    @classmethod
    def tile(cls, op: "DataFrameCheckNA"):
        """Emit one output chunk per input chunk, reusing the input's nsplits."""
        source = op.inputs[0]
        result = op.outputs[0]

        out_chunks = []
        for in_chunk in source.chunks:
            chunk_params = cls._boolean_params(in_chunk, op.output_types[0])
            # Every chunk gets its own copy of the operand with a fresh key.
            chunk_op = op.copy().reset_key()
            out_chunks.append(chunk_op.new_chunk([in_chunk], **chunk_params))

        tiled_op = op.copy().reset_key()
        tiled_params = result.params.copy()
        tiled_params["chunks"] = out_chunks
        tiled_params["nsplits"] = source.nsplits
        return tiled_op.new_tileables([source], **tiled_params)

    @classmethod
    def execute(cls, ctx, op: "DataFrameCheckNA"):
        """Compute the mask for one chunk and store it in ``ctx``."""
        data = ctx[op.inputs[0].key]
        ctx[op.outputs[0].key] = data.isna() if op.positive else data.notna()


def _from_pandas(obj: Any):
    if isinstance(obj, pd.DataFrame):
        from ..datasource.dataframe import from_pandas

        return from_pandas(obj)
    elif isinstance(obj, pd.Series):
        from ..datasource.series import from_pandas

        return from_pandas(obj)
    elif isinstance(obj, np.ndarray):
        return mt.tensor(obj)
    else:
        return obj


def isna(obj):
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    NA values, such as None or :attr:`numpy.nan`, get mapped to True
    values.

    Everything else gets mapped to False values. Characters such as empty
    strings ``''`` or :attr:`numpy.inf` are not considered NA values
    (unless you set ``pandas.options.mode.use_inf_as_na = True``).

    Parameters
    ----------
    obj : DataFrame, Series, Index, Tensor, or any object accepted by ``pandas.isna``
        Object to check for missing values.

    Returns
    -------
    DataFrame, Series, Tensor, or scalar
        Mask of bool values for each element of ``obj`` that
        indicates whether an element is an NA value. Tensor inputs are
        checked with ``mt.isnan``; non-Mars inputs are evaluated with
        ``pandas.isna`` and DataFrame / Series / ndarray results are
        converted to Mars objects.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.

    See Also
    --------
    DataFrame.isnull : Alias of isna.
    DataFrame.notna : Boolean inverse of isna.
    DataFrame.dropna : Omit axes labels with missing values.
    notna : Top-level boolean inverse of isna.

    Examples
    --------
    Show which entries in a DataFrame are NA.

    >>> import numpy as np
    >>> import mars.dataframe as md
    >>> df = md.DataFrame({'age': [5, 6, np.nan],
    ...                    'born': [md.NaT, md.Timestamp('1939-05-27'),
    ...                             md.Timestamp('1940-04-25')],
    ...                    'name': ['Alfred', 'Batman', ''],
    ...                    'toy': [None, 'Batmobile', 'Joker']})
    >>> df.execute()
       age       born    name        toy
    0  5.0        NaT  Alfred       None
    1  6.0 1939-05-27  Batman  Batmobile
    2  NaN 1940-04-25              Joker

    >>> df.isna().execute()
         age   born   name    toy
    0  False   True  False   True
    1  False  False  False  False
    2   True  False  False  False

    Show which entries in a Series are NA.

    >>> ser = md.Series([5, 6, np.nan])
    >>> ser.execute()
    0    5.0
    1    6.0
    2    NaN
    dtype: float64

    >>> ser.isna().execute()
    0    False
    1    False
    2     True
    dtype: bool
    """
    if isinstance(obj, md.MultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")
    elif isinstance(obj, ENTITY_TYPE):
        if isinstance(obj, TENSOR_TYPE):
            # Tensors have their own element-wise NaN check.
            return mt.isnan(obj)
        else:
            op = DataFrameCheckNA(positive=True)
            return op(obj)
    else:
        # Plain Python / pandas / NumPy input: let pandas compute the mask,
        # then wrap array-like results back into Mars objects.
        return _from_pandas(pd.isna(obj))


def notna(obj):
    """
    Detect existing (non-missing) values.

    Return a boolean same-sized object indicating if the values are not NA.
    Non-missing values get mapped to True. Characters such as empty
    strings ``''`` or :attr:`numpy.inf` are not considered NA values
    (unless you set ``pandas.options.mode.use_inf_as_na = True``).
    NA values, such as None or :attr:`numpy.nan`, get mapped to False
    values.

    Parameters
    ----------
    obj : DataFrame, Series, Index, Tensor, or any object accepted by ``pandas.notna``
        Object to check for non-missing values.

    Returns
    -------
    DataFrame, Series, Tensor, or scalar
        Mask of bool values for each element of ``obj`` that
        indicates whether an element is not an NA value. Tensor inputs are
        checked with the inverse of ``mt.isnan``; non-Mars inputs are
        evaluated with ``pandas.notna`` and DataFrame / Series / ndarray
        results are converted to Mars objects.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.

    See Also
    --------
    DataFrame.notnull : Alias of notna.
    DataFrame.isna : Boolean inverse of notna.
    DataFrame.dropna : Omit axes labels with missing values.
    isna : Top-level boolean inverse of notna.

    Examples
    --------
    Show which entries in a DataFrame are not NA.

    >>> import numpy as np
    >>> import mars.dataframe as md
    >>> df = md.DataFrame({'age': [5, 6, np.nan],
    ...                    'born': [md.NaT, md.Timestamp('1939-05-27'),
    ...                             md.Timestamp('1940-04-25')],
    ...                    'name': ['Alfred', 'Batman', ''],
    ...                    'toy': [None, 'Batmobile', 'Joker']})
    >>> df.execute()
       age       born    name        toy
    0  5.0        NaT  Alfred       None
    1  6.0 1939-05-27  Batman  Batmobile
    2  NaN 1940-04-25              Joker

    >>> df.notna().execute()
         age   born  name    toy
    0   True  False  True  False
    1   True   True  True   True
    2  False   True  True   True

    Show which entries in a Series are not NA.

    >>> ser = md.Series([5, 6, np.nan])
    >>> ser.execute()
    0    5.0
    1    6.0
    2    NaN
    dtype: float64

    >>> ser.notna().execute()
    0     True
    1     True
    2    False
    dtype: bool
    """
    if isinstance(obj, md.MultiIndex):
        # The message names this function, not its ``isna`` counterpart.
        raise NotImplementedError("notna is not defined for MultiIndex")
    elif isinstance(obj, ENTITY_TYPE):
        if isinstance(obj, TENSOR_TYPE):
            # Tensors have their own element-wise NaN check; invert it.
            return ~mt.isnan(obj)
        else:
            op = DataFrameCheckNA(positive=False)
            return op(obj)
    else:
        # Plain Python / pandas / NumPy input: let pandas compute the mask,
        # then wrap array-like results back into Mars objects.
        return _from_pandas(pd.notna(obj))


# Aliases: ``isnull`` is bound to the same function object as ``isna``, and
# ``notnull`` to the same function object as ``notna``.
isnull = isna
notnull = notna