Code source de geonature.core.imports.checks.dataframe.cast

from typing import Any, Dict, Iterator, Optional, Set
import re
from uuid import UUID
from itertools import product
from datetime import datetime

from geonature.core.imports.checks.errors import ImportCodeError
import pandas as pd
from sqlalchemy.sql import sqltypes
from sqlalchemy.dialects.postgresql import UUID as UUIDType

from geonature.core.imports.models import BibFields, Entity
from .utils import dataframe_check



[docs]
def convert_to_datetime(value_raw):
    """
    Try to convert a date string to a datetime object.

    The input is normalised before matching: surrounding whitespace is
    stripped, runs of spaces are collapsed to a single space, and the
    characters ``/``, ``.`` and ``:`` are replaced by ``-``. The normalised
    string is then tried against every combination of the supported date
    formats (year-first or day-first) and time formats (none, ``H``,
    ``H-M``, ``H-M-S``, ``H-M-S-f``, ``Hh``, ``HhM``, ``HhMm``, ``HhMmSs``).
    If none match, the stripped (but otherwise untouched) input is handed to
    ``datetime.fromisoformat``.

    Parameters
    ----------
    value_raw : str
        The input string to convert

    Returns
    -------
    converted_date : datetime or None
        The converted datetime object or None if the conversion failed
    """
    stripped = value_raw.strip()
    value = re.sub("[ ]+", " ", stripped)
    value = re.sub("[/.:]", "-", value)
    date_formats = [
        "%Y-%m-%d",
        "%d-%m-%Y",
    ]
    time_formats = [
        None,
        "%H",
        "%H-%M",
        "%H-%M-%S",
        "%H-%M-%S-%f",
        "%Hh",
        "%Hh%M",
        "%Hh%Mm",
        "%Hh%Mm%Ss",
    ]
    for date_format, time_format in product(date_formats, time_formats):
        fmt = (date_format + " " + time_format) if time_format else date_format
        try:
            # The first format that parses wins.
            return datetime.strptime(value, fmt)
        except ValueError:
            continue

    # Fallback on ISO parsing. The separator substitution above would break
    # an ISO string, so the fallback works on the stripped original text;
    # stripping keeps whitespace handling consistent with the formats above.
    try:
        return datetime.fromisoformat(stripped)
    except ValueError:
        return None




[docs]
def convert_to_uuid(value):
    """
    Validate that a value can be read as a UUID.

    Parameters
    ----------
    value : Any
        The value to validate; its string representation is what gets parsed.

    Returns
    -------
    str or None
        The string representation of ``value`` when ``uuid.UUID`` accepts it,
        None otherwise. The text is returned as given, not re-formatted.
    """
    try:
        text = str(value)
        # Only used as a validator: the parsed UUID object is discarded.
        UUID(text)
    except ValueError:
        return None
    return text




[docs]
def convert_to_integer(value):
    """
    Try to convert a value to an integer.

    Parameters
    ----------
    value : Any
        The value to convert, passed as-is to ``int``.

    Returns
    -------
    int or None
        The converted integer, or None if ``int`` rejects the value.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: malformed text
        # or NaN; OverflowError: infinite float. All mean "not an integer".
        return None




[docs]
def check_datetime_field(
    df: pd.DataFrame, source_field: str, target_field: str, required: bool
) -> Set[str]:
    """
    Convert a column to datetime values and report the rows that failed.

    This is a generator: errors are yielded, and the set of updated columns
    is the generator's return value (retrieved with ``yield from``).

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to check; ``target_field`` is written in place.
    source_field : str
        The name of the column to check.
    target_field : str
        The name of the column where to store the result.
    required : bool
        When true, null source values are reported as invalid too.

    Yields
    ------
    dict
        At most one dictionary with the keys ``error_code``,
        ``invalid_rows`` and ``comment``.

    Returns
    -------
    set
        Set containing the name of the target field.

    Notes
    -----
    The error codes are:
        - INVALID_DATE: the value could not be converted to a datetime.
    """
    source = df[source_field]
    # Null values are passed through untouched; everything else is parsed.
    converted = source.apply(lambda raw: raw if pd.isnull(raw) else convert_to_datetime(raw))
    failed = converted.isna()
    if not required:
        # Values that were already null in the source are not conversion failures.
        failed = failed & source.notna()
    # Rows are selected before the target column is written.
    invalid_rows = df[failed]
    df[target_field] = converted
    if len(invalid_rows) > 0:
        rejected_values = invalid_rows[source_field]
        yield {
            "error_code": ImportCodeError.INVALID_DATE,
            "invalid_rows": invalid_rows,
            "comment": "Les dates suivantes ne sont pas au bon format: {}".format(
                ", ".join(str(rejected) for rejected in rejected_values)
            ),
        }
    return {target_field}




[docs]
def check_uuid_field(
    df: pd.DataFrame, source_field: str, target_field: str, required: bool
) -> Set[str]:
    """
    Validate a column of UUIDs and report the rows that failed.

    This is a generator: errors are yielded, and the set of updated columns
    is the generator's return value (retrieved with ``yield from``).

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to check; ``target_field`` is written in place.
    source_field : str
        The name of the column to check.
    target_field : str
        The name of the column where to store the result.
    required : bool
        When true, null source values are reported as invalid too.

    Yields
    ------
    dict
        At most one dictionary with the keys ``error_code``,
        ``invalid_rows`` and ``comment``.

    Returns
    -------
    set
        Set containing the name of the target field.

    Notes
    -----
    The error codes are:
        - INVALID_UUID: the value is not a valid UUID.
    """
    source = df[source_field]
    # Null values are passed through untouched; everything else is validated.
    converted = source.apply(lambda raw: raw if pd.isnull(raw) else convert_to_uuid(raw))
    failed = converted.isna()
    if not required:
        # Values that were already null in the source are not conversion failures.
        failed = failed & source.notna()
    # Rows are selected before the target column is written.
    invalid_rows = df[failed]
    df[target_field] = converted
    if len(invalid_rows) > 0:
        rejected_values = invalid_rows[source_field]
        yield {
            "error_code": ImportCodeError.INVALID_UUID,
            "invalid_rows": invalid_rows,
            "comment": "Les UUID suivantes ne sont pas au bon format: {}".format(
                ", ".join(str(rejected) for rejected in rejected_values)
            ),
        }
    return {target_field}




[docs]
def check_integer_field(
    df: pd.DataFrame, source_field: str, target_field: str, required: bool
) -> Set[str]:
    """
    Convert a column to integers and report the rows that failed.

    This is a generator: errors are yielded, and the set of updated columns
    is the generator's return value (retrieved with ``yield from``).

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to check; ``target_field`` is written in place.
    source_field : str
        The name of the column to check.
    target_field : str
        The name of the column where to store the result.
    required : bool
        When true, null source values are reported as invalid too.

    Yields
    ------
    dict
        At most one dictionary with the keys ``error_code``,
        ``invalid_rows`` and ``comment``.

    Returns
    -------
    set
        Set containing the name of the target field.

    Notes
    -----
    The error codes are:
        - INVALID_INTEGER: the value could not be converted to an integer.
    """
    source = df[source_field]
    # Null values are passed through untouched; everything else is converted.
    converted = source.apply(lambda raw: raw if pd.isnull(raw) else convert_to_integer(raw))
    failed = converted.isna()
    if not required:
        # Values that were already null in the source are not conversion failures.
        failed = failed & source.notna()
    # Rows are selected before the target column is written.
    invalid_rows = df[failed]
    df[target_field] = converted
    if len(invalid_rows) > 0:
        rejected_values = invalid_rows[source_field]
        yield {
            "error_code": ImportCodeError.INVALID_INTEGER,
            "invalid_rows": invalid_rows,
            "comment": "Les valeurs suivantes ne sont pas des entiers : {}".format(
                ", ".join(str(rejected) for rejected in rejected_values)
            ),
        }
    return {target_field}




[docs]
def check_numeric_field(
    df: pd.DataFrame, source_field: str, target_field: str, required: bool
) -> Set[str]:
    """
    Check if column string values are numerics and convert it to numeric type.

    This is a generator: errors are yielded, and the set of updated columns
    is the generator's return value (retrieved with ``yield from``).

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to check; ``target_field`` is written in place.
    source_field : str
        The name of the column to check.
    target_field : str
        The name of the column where to store the result.
    required : bool
        When true, null source values are reported as invalid too.

    Yields
    ------
    dict
        At most one dictionary with the keys ``error_code``,
        ``invalid_rows`` and ``comment``.

    Returns
    -------
    set
        Set containing the name of the target field.

    Notes
    -----
    The error codes are:
        - INVALID_NUMERIC: the value could not be converted to a float.
    """

    def to_numeric(x):
        # Only the errors float() raises for unconvertible input are treated
        # as "not a number"; anything else (e.g. KeyboardInterrupt) propagates.
        try:
            return float(x)
        except (TypeError, ValueError, OverflowError):
            return None

    numeric_col = df[source_field].apply(lambda x: to_numeric(x) if pd.notnull(x) else x)
    if required:
        invalid_rows = df[numeric_col.isna()]
    else:
        # invalid rows are NaN rows which were not already set to NaN
        invalid_rows = df[numeric_col.isna() & df[source_field].notna()]
    # Rows are selected before the target column is written.
    df[target_field] = numeric_col
    if len(invalid_rows) > 0:
        values_error = invalid_rows[source_field]
        yield dict(
            error_code=ImportCodeError.INVALID_NUMERIC,
            invalid_rows=invalid_rows,
            comment="Les valeurs suivantes ne sont pas des nombres : {}".format(
                ", ".join(map(str, values_error))
            ),
        )
    return {target_field}




[docs]
def check_unicode_field(
    df: pd.DataFrame, field: str, field_length: Optional[int]
) -> Iterator[Dict[str, Any]]:
    """
    Report the rows whose value in ``field`` exceeds the maximum length.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to check. It is not modified.
    field : str
        The name of the column to check.
    field_length : Optional[int]
        The maximum length of the column; None disables the check.

    Yields
    ------
    dict
        At most one dictionary with the keys ``error_code`` and
        ``invalid_rows``.

    Notes
    -----
    The error codes are:
        - INVALID_CHAR_LENGTH: the string is too long.
    """
    if field_length is not None:
        # Null values keep their null marker, so they never compare as too long.
        lengths = df[field].apply(lambda text: text if pd.isnull(text) else len(text))
        too_long = df[lengths > field_length]
        if len(too_long) > 0:
            yield {
                "error_code": ImportCodeError.INVALID_CHAR_LENGTH,
                "invalid_rows": too_long,
            }




[docs]
def check_boolean_field(df, source_col, dest_col, required):
    """
    Check a boolean field in a dataframe.

    Each non-null value is converted with ``bool(int(value))``. Null values
    are kept as null, and values that ``int`` rejects are stored as None and
    reported. This is a generator: errors are yielded, and the set of
    updated columns is the generator's return value.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to check; ``dest_col`` is written in place.
    source_col : str
        The name of the column to check.
    dest_col : str
        The name of the column where to store the result.
    required : bool
        Whether the column is mandatory or not.

    Yields
    ------
    dict
        A dictionary containing an error code and the rows with errors.
        Nothing is yielded for an error code that has no matching row.

    Returns
    -------
    set
        Set containing the name of the destination column.

    Notes
    -----
    The error codes are:
        - MISSING_VALUE: the value is mandatory but it's missing (null).
        - INVALID_BOOL: the value is not a boolean.

    """

    def to_bool(value):
        # Nulls are preserved so missing values can be told apart from
        # values that are present but unconvertible.
        if pd.isnull(value):
            return value
        try:
            return bool(int(value))
        except (TypeError, ValueError, OverflowError):
            return None

    source = df[source_col]
    bool_col = source.apply(to_bool)
    missing_mask = source.isna()
    # Present in the source but rejected by the conversion.
    invalid_mask = bool_col.isna() & ~missing_mask
    df[dest_col] = bool_col

    if required and missing_mask.sum() > 0:  # FIXME: to remove as done in check_required_value
        yield dict(error_code=ImportCodeError.MISSING_VALUE, invalid_rows=df[missing_mask])
    if invalid_mask.sum() > 0:
        yield dict(error_code=ImportCodeError.INVALID_BOOL, invalid_rows=df[invalid_mask])
    return {dest_col}




[docs]
def check_anytype_field(
    df: pd.DataFrame,
    field_type: sqltypes.TypeEngine,
    source_col: str,
    dest_col: str,
    required: bool,
) -> Set[str]:
    """
    Dispatch a column to the checker matching its SQLAlchemy type.

    This is a generator: the errors of the selected checker are re-yielded,
    and the set of updated columns is the generator's return value.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to check.
    field_type : sqlalchemy.TypeEngine
        The type of the column to check.
    source_col : str
        The name of the column to check.
    dest_col : str
        The name of the column where to store the result.
    required : bool
        Whether the column is mandatory or not.

    Yields
    ------
    dict
        A dictionary containing an error code and the rows with errors.

    Returns
    -------
    set
        Set containing the name of columns updated in the dataframe; empty
        for string fields, which are only length-checked.

    Raises
    ------
    Exception
        If ``field_type`` matches none of the handled types.
    """
    # The isinstance tests run in this exact order; the first match wins.
    if isinstance(field_type, sqltypes.DateTime):
        converter = check_datetime_field
    elif isinstance(field_type, sqltypes.Integer):
        converter = check_integer_field
    elif isinstance(field_type, UUIDType):
        converter = check_uuid_field
    elif isinstance(field_type, sqltypes.String):
        # Strings are not converted: only their length is validated, and the
        # check reads ``dest_col`` (not ``source_col``).
        yield from check_unicode_field(df, dest_col, field_length=field_type.length)
        return set()
    elif isinstance(field_type, sqltypes.Boolean):
        converter = check_boolean_field
    elif isinstance(field_type, sqltypes.Numeric):
        converter = check_numeric_field
    else:
        raise Exception(
            "Unknown type {} for field {}".format(type(field_type), dest_col)
        )  # pragma: no cover

    changed = set()
    # ``yield from`` forwards the checker's errors and evaluates to its return value.
    changed |= yield from converter(df, source_col, dest_col, required)
    return changed



@dataframe_check

[docs]
def check_types(entity: Entity, df: pd.DataFrame, fields: Dict[str, BibFields]) -> Set[str]:
    """
    Type-check and convert the dataframe columns described by ``fields``.

    This is a generator: every error reported by ``check_anytype_field`` is
    re-yielded with an extra ``column`` key holding the field name, and the
    set of updated columns is the generator's return value.

    Parameters
    ----------
    entity : Entity
        The entity to check.
    df : pd.DataFrame
        The dataframe to check.
    fields : Dict[str, BibFields]
        A dictionary mapping column names to their corresponding BibFields.

    Yields
    ------
    dict
        The error dictionaries of the type checkers, each extended with a
        ``column`` key.

    Returns
    -------
    Set[str]
        Set containing the names of updated columns.
    """
    destination_table = entity.get_destination_table()
    transient_table = entity.destination.get_transient_table()
    updated_cols = set()
    for name, field in fields.items():
        # Skip fields without a destination, fields whose source column is
        # absent from the dataframe, and fields with a mnemonique
        # (set from content mapping).
        if not field.dest_field or field.source_column not in df or field.mnemonique:
            continue
        assert entity in [ef.entity for ef in field.entities]  # FIXME
        # we may require to convert some columns unused in final destination,
        # in which case the type comes from the transient table
        if field.dest_field in destination_table.c:
            type_source = destination_table
        else:
            type_source = transient_table
        field_type = type_source.c[field.dest_field].type

        checker = check_anytype_field(
            df,
            field_type=field_type,
            source_col=field.source_column,
            dest_col=field.dest_field,
            required=False,
        )
        while True:
            try:
                error = next(checker)
            except StopIteration as finished:
                # The checker's return value is the set of columns it updated.
                updated_cols |= finished.value
                break
            yield {"column": name, **error}
    return updated_cols