4 changes: 4 additions & 0 deletions hashprep/checks/__init__.py
@@ -1,6 +1,7 @@
from .columns import _check_duplicates, _check_high_cardinality, _check_mixed_data_types, _check_single_value_columns
from .core import Issue as Issue
from .correlations import calculate_correlations
from .datetime_checks import _check_datetime_future_dates, _check_datetime_gaps, _check_datetime_monotonicity
from .distribution import _check_uniform_distribution, _check_unique_values
from .drift import check_drift
from .imbalance import _check_class_imbalance
@@ -45,6 +46,9 @@ def _check_dataset_drift(analyzer):
"high_zero_counts": _check_high_zero_counts,
"extreme_text_lengths": _check_extreme_text_lengths,
"datetime_skew": _check_datetime_skew,
"datetime_future_dates": _check_datetime_future_dates,
"datetime_gaps": _check_datetime_gaps,
"datetime_monotonicity": _check_datetime_monotonicity,
"missing_patterns": _check_missing_patterns,
"skewness": _check_skewness,
"dataset_drift": _check_dataset_drift,
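The three new registry entries make the datetime checks addressable by the same string keys used in the analyzer's default check list. A minimal dispatch sketch, assuming the registry dict is exposed under a name like CHECK_REGISTRY (hypothetical; the real identifier sits above the visible hunk) and that each check takes an analyzer and returns a list of Issue objects, as the signatures in datetime_checks.py below show:

from hashprep.checks import CHECK_REGISTRY  # hypothetical name for the dict shown above

def run_checks(analyzer, names):
    # Look up each named check and collect the issues it reports.
    issues = []
    for name in names:
        check_fn = CHECK_REGISTRY[name]    # e.g., _check_datetime_future_dates
        issues.extend(check_fn(analyzer))  # every check returns list[Issue]
    return issues

# run_checks(analyzer, ["datetime_future_dates", "datetime_gaps", "datetime_monotonicity"])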
153 changes: 153 additions & 0 deletions hashprep/checks/datetime_checks.py
@@ -0,0 +1,153 @@
import numpy as np
import pandas as pd

from ..config import DEFAULT_CONFIG
from .core import Issue

_DT_CFG = DEFAULT_CONFIG.datetime


def _coerce_datetime(series: pd.Series) -> pd.Series:
"""Return a datetime Series regardless of whether the source is datetime64 or object."""
if pd.api.types.is_datetime64_any_dtype(series):
return series.dropna()
return pd.to_datetime(series, errors="coerce").dropna()


def _datetime_cols(analyzer) -> list[str]:
"""Return columns inferred as DateTime."""
return [col for col, typ in analyzer.column_types.items() if typ == "DateTime"]
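
# Example: coercing an object column of ISO strings with one bad value:
#   _coerce_datetime(pd.Series(["2024-01-01", "oops", None]))
# parses the first entry, coerces "oops" and the null to NaT, and drops both,
# returning a one-element datetime64 Series.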


def _check_datetime_future_dates(analyzer) -> list[Issue]:
"""Flag datetime columns that contain values in the future (likely data errors)."""
issues = []
now = pd.Timestamp.now()

for col in _datetime_cols(analyzer):
dt = _coerce_datetime(analyzer.df[col])
if dt.empty:
continue

future_count = int((dt > now).sum())
if future_count == 0:
continue

future_ratio = future_count / len(dt)
severity = "critical" if future_ratio > _DT_CFG.future_date_critical_ratio else "warning"
impact = "high" if severity == "critical" else "medium"
issues.append(
Issue(
category="datetime_future_dates",
severity=severity,
column=col,
description=(
f"Column '{col}' has {future_count} future-dated values "
f"({future_ratio:.1%} of non-missing) — latest: {dt.max().date()}"
),
impact_score=impact,
quick_fix=(
"Options:\n"
"- Investigate source: Future dates often indicate data entry errors or clock skew.\n"
"- Cap to present: Replace future dates with today or NaN.\n"
"- Exclude rows: Drop records with future timestamps before training."
),
)
)
return issues
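
# Worked example with the default future_date_critical_ratio of 0.05:
#   3 future dates among 100 parseable values → ratio 0.03 → "warning"
#   6 future dates among 100 parseable values → ratio 0.06 → "critical"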


def _check_datetime_gaps(analyzer) -> list[Issue]:
"""Detect anomalously large gaps in datetime columns (broken time series)."""
issues = []

for col in _datetime_cols(analyzer):
dt = _coerce_datetime(analyzer.df[col]).sort_values()
if len(dt) < _DT_CFG.min_rows_for_gap_check:
continue

diffs = dt.diff().dropna()
if diffs.empty:
continue

# Work in total seconds for a unit-agnostic comparison
diff_seconds = diffs.dt.total_seconds()
median_gap = float(diff_seconds.median())
if median_gap <= 0:
continue

max_gap = float(diff_seconds.max())
ratio = max_gap / median_gap

if ratio >= _DT_CFG.gap_multiplier_warning:
severity = "critical" if ratio >= _DT_CFG.gap_multiplier_critical else "warning"
impact = "high" if severity == "critical" else "medium"

# Locate the gap for a human-readable description
gap_idx = int(np.argmax(diff_seconds.values))
gap_start = dt.iloc[gap_idx]
gap_end = dt.iloc[gap_idx + 1]
gap_days = (gap_end - gap_start).days

issues.append(
Issue(
category="datetime_gaps",
severity=severity,
column=col,
description=(
f"Column '{col}' has an anomalous gap of {gap_days} days "
f"({ratio:.0f}× the median gap) between {gap_start.date()} and {gap_end.date()}"
),
impact_score=impact,
quick_fix=(
"Options:\n"
"- Investigate gap: May indicate missing data collection periods.\n"
"- Impute missing periods: Forward-fill or interpolate for time-series models.\n"
"- Flag as a feature: Create a binary 'gap_present' indicator.\n"
"- Segment model: Train separate models for each contiguous period."
),
)
)
return issues
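
# Worked example with the default multipliers (warning at 5x, critical at 20x):
# daily data has a median gap of 86,400 s, so a 15-day hole (ratio 15) is a
# "warning" and a 25-day hole (ratio 25) is "critical".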


def _check_datetime_monotonicity(analyzer) -> list[Issue]:
"""Warn when a datetime column that looks like a time-series index is non-monotonic."""
issues = []

for col in _datetime_cols(analyzer):
dt = _coerce_datetime(analyzer.df[col])
if len(dt) < _DT_CFG.min_rows_for_gap_check:
continue

# Only flag if the column has mostly unique values (i.e., likely an index/timestamp)
unique_ratio = dt.nunique() / len(dt)
if unique_ratio < 0.9:
continue

if not (dt.is_monotonic_increasing or dt.is_monotonic_decreasing):
# Count out-of-order entries
sorted_dt = dt.sort_values()
out_of_order = int((dt.reset_index(drop=True) != sorted_dt.reset_index(drop=True)).sum())
out_ratio = out_of_order / len(dt)
severity = "warning"
impact = "medium"
issues.append(
Issue(
category="datetime_monotonicity",
severity=severity,
column=col,
description=(
f"Column '{col}' is non-monotonic: {out_of_order} rows "
f"({out_ratio:.1%}) are out of temporal order"
),
impact_score=impact,
quick_fix=(
"Options:\n"
"- Sort by this column: Restores temporal order before time-series modeling.\n"
"- Investigate duplicates: Non-monotonicity may reveal duplicate or misaligned records.\n"
"- Retain if intentional: Some datasets (e.g., event logs) are legitimately unordered."
),
)
)
return issues
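All three checks read only two attributes from the analyzer: the raw df and the inferred column_types mapping. A self-contained sketch of their behavior on a toy frame (SimpleNamespace stands in for the real DatasetAnalyzer, which supplies the same attributes):

import pandas as pd
from types import SimpleNamespace

from hashprep.checks.datetime_checks import (
    _check_datetime_future_dates,
    _check_datetime_gaps,
    _check_datetime_monotonicity,
)

df = pd.DataFrame({
    "ts": pd.to_datetime(
        ["2024-01-01", "2024-01-02", "2024-01-05", "2024-01-03"]  # out of order
        + [f"2024-02-0{d}" for d in range(1, 8)]                  # a contiguous week
        + ["2200-01-01"]                                          # far-future record
    )
})

# Stand-in for DatasetAnalyzer: the checks only touch .df and .column_types.
analyzer = SimpleNamespace(df=df, column_types={"ts": "DateTime"})

for check in (_check_datetime_future_dates, _check_datetime_gaps, _check_datetime_monotonicity):
    for issue in check(analyzer):
        print(issue.category, issue.severity, issue.description)

# Expect all three to fire: one future date (ratio 1/12 > 0.05 → critical),
# a huge max-gap/median-gap ratio (→ critical), and out-of-order rows (→ warning).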
16 changes: 16 additions & 0 deletions hashprep/config.py
@@ -126,6 +126,21 @@ class ImbalanceThresholds:
majority_class_ratio: float = 0.9


@dataclass(frozen=True)
class DateTimeThresholds:
"""Thresholds for datetime-specific checks."""

# Minimum ratio of parseable values required to classify an object column as DateTime
parse_threshold: float = 0.8
# Any future-dated values trigger a warning (ratio > 0 → warn, ratio > this → critical)
future_date_critical_ratio: float = 0.05
# A gap is anomalous if it exceeds this multiple of the median gap
gap_multiplier_warning: float = 5.0
gap_multiplier_critical: float = 20.0
# Minimum number of rows needed to run gap/monotonicity checks
min_rows_for_gap_check: int = 10


@dataclass(frozen=True)
class TypeInferenceConfig:
"""Configuration for type inference."""
@@ -175,6 +190,7 @@ class HashPrepConfig:
drift: DriftThresholds = field(default_factory=DriftThresholds)
distribution: DistributionThresholds = field(default_factory=DistributionThresholds)
imbalance: ImbalanceThresholds = field(default_factory=ImbalanceThresholds)
datetime: DateTimeThresholds = field(default_factory=DateTimeThresholds)
type_inference: TypeInferenceConfig = field(default_factory=TypeInferenceConfig)
sampling: SamplingDefaults = field(default_factory=SamplingDefaults)
summaries: SummaryDefaults = field(default_factory=SummaryDefaults)
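
Because the thresholds live in a frozen dataclass wired into HashPrepConfig via default_factory, a stricter datetime policy is just another config instance. A sketch, assuming both classes are importable from hashprep.config; note that the check modules bind DEFAULT_CONFIG.datetime at import time, so a custom instance only takes effect where a config object is accepted:

from hashprep.config import DateTimeThresholds, HashPrepConfig

# Tighten the gap heuristics: warn at 3x the median gap, escalate at 10x,
# and require more rows before trusting the gap statistics at all.
strict_datetime = DateTimeThresholds(
    gap_multiplier_warning=3.0,
    gap_multiplier_critical=10.0,
    min_rows_for_gap_check=30,
)
config = HashPrepConfig(datetime=strict_datetime)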
3 changes: 3 additions & 0 deletions hashprep/core/analyzer.py
@@ -49,6 +49,9 @@ class DatasetAnalyzer:
"high_zero_counts",
"extreme_text_lengths",
"datetime_skew",
"datetime_future_dates",
"datetime_gaps",
"datetime_monotonicity",
"missing_patterns",
"skewness",
"dataset_drift",
56 changes: 50 additions & 6 deletions hashprep/summaries/variables.py
@@ -324,9 +324,14 @@ def _summarize_categorical(df, col):


def _summarize_datetime(df, col):
- dt_series = pd.to_datetime(df[col], errors="coerce")
+ if pd.api.types.is_datetime64_any_dtype(df[col]):
+ dt_series = df[col]
+ parse_fails = 0
+ else:
+ dt_series = pd.to_datetime(df[col], errors="coerce")
+ parse_fails = int((dt_series.isna() & df[col].notna()).sum())

valid_series = dt_series.dropna()
- parse_fails = int((dt_series.isna() & df[col].notna()).sum())
invalid_percentage = (parse_fails / len(df) * 100) if len(df) > 0 else 0.0

if valid_series.empty:
@@ -337,27 +342,66 @@
"invalid_count": parse_fails,
"invalid_percentage": invalid_percentage,
"counts": None,
"gap_stats": None,
"monotonicity": None,
"future_count": None,
}

min_dt = valid_series.min()
max_dt = valid_series.max()
range_delta = max_dt - min_dt
now = pd.Timestamp.now()

year_counts = {int(k): int(v) for k, v in valid_series.dt.year.value_counts().items()}
month_counts = {int(k): int(v) for k, v in valid_series.dt.month.value_counts().items()}
weekday_counts = {int(k): int(v) for k, v in valid_series.dt.dayofweek.value_counts().items()}
day_counts = {int(k): int(v) for k, v in valid_series.dt.day.value_counts().items()}

# Sub-day precision: include hour distribution if values have non-zero hours
has_time = bool((valid_series.dt.hour != 0).any() or (valid_series.dt.minute != 0).any())
hour_counts = {int(k): int(v) for k, v in valid_series.dt.hour.value_counts().items()} if has_time else None

# Gap statistics (sorted diffs)
sorted_series = valid_series.sort_values()
diffs = sorted_series.diff().dropna()
gap_stats = None
if len(diffs) > 0:
diff_seconds = diffs.dt.total_seconds()
gap_stats = {
"median_gap_seconds": float(diff_seconds.median()),
"max_gap_seconds": float(diff_seconds.max()),
"min_gap_seconds": float(diff_seconds.min()),
"mean_gap_seconds": float(diff_seconds.mean()),
}

# Monotonicity
if valid_series.is_monotonic_increasing:
monotonicity = "increasing"
elif valid_series.is_monotonic_decreasing:
monotonicity = "decreasing"
else:
monotonicity = "non-monotonic"

- year_counts = valid_series.dt.year.value_counts().to_dict()
- month_counts = valid_series.dt.month.value_counts().to_dict()
- day_counts = valid_series.dt.day.value_counts().to_dict()
# Future dates
future_count = int((valid_series > now).sum())

stats = {
"minimum": str(min_dt),
"maximum": str(max_dt),
"range_days": int(range_delta.days),
"range_str": str(range_delta),
"invalid_count": parse_fails,
"invalid_percentage": invalid_percentage,
"invalid_percentage": float(invalid_percentage),
"future_count": future_count,
"monotonicity": monotonicity,
"has_time_component": has_time,
"gap_stats": gap_stats,
"counts": {
"years": year_counts,
"months": month_counts,
"weekdays": weekday_counts,
"days": day_counts,
"hours": hour_counts,
},
}
return stats
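
A sketch of the enriched summary on a small hourly series, showing the new keys (values beyond the deterministic ones depend on the data and on when the call runs):

import pandas as pd
from hashprep.summaries.variables import _summarize_datetime

df = pd.DataFrame({"ts": pd.date_range("2024-01-01", periods=48, freq="h")})
stats = _summarize_datetime(df, "ts")

stats["monotonicity"]                     # "increasing": date_range output is sorted
stats["has_time_component"]               # True: non-zero hours are present
stats["gap_stats"]["median_gap_seconds"]  # 3600.0 for an hourly grid
stats["future_count"]                     # 0, since the range lies in the past
stats["counts"]["hours"]                  # two occurrences of each hour 0-23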
29 changes: 25 additions & 4 deletions hashprep/utils/type_inference.py
@@ -3,6 +3,7 @@
from ..config import DEFAULT_CONFIG

_TYPE_CFG = DEFAULT_CONFIG.type_inference
_DT_CFG = DEFAULT_CONFIG.datetime
CONFIG = {
"cat_cardinality_threshold": _TYPE_CFG.cat_cardinality_threshold,
"cat_percentage_threshold": _TYPE_CFG.cat_percentage_threshold,
@@ -11,10 +12,20 @@
}


def _looks_like_datetime(series: pd.Series) -> bool:
"""Return True if an object/string column parses as datetime above the threshold."""
sample = series.dropna().head(200)
if len(sample) == 0:
return False
parsed = pd.to_datetime(sample, errors="coerce")
parse_ratio = parsed.notna().mean()
return float(parse_ratio) >= _DT_CFG.parse_threshold
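
# Worked example with the default parse_threshold of 0.8:
#   ["2021-05-01", "2021-06-01", "n/a", "2021-07-01"] → 3/4 parse (0.75) → False
#   ["2021-05-01", "2021-06-01", "2021-07-01"]        → 3/3 parse (1.0)  → True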


def infer_types(df: pd.DataFrame) -> dict[str, str]:
"""
Infer semantic types per ydata logic.
- Returns: {col: 'Numeric' | 'Categorical' | 'Text' | 'Unsupported'}
+ Returns: {col: 'Numeric' | 'Categorical' | 'Text' | 'DateTime' | 'Boolean' | 'Unsupported'}
"""
types = {}
for col in df.columns:
@@ -23,21 +34,31 @@ def infer_types(df: pd.DataFrame) -> dict[str, str]:
types[col] = "Unsupported"
continue

# DateTime: native datetime64 dtype
if pd.api.types.is_datetime64_any_dtype(series):
types[col] = "DateTime"

# Numeric inference (ydata's Numeric.contains_op + numeric_is_category)
- if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
+ elif pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
n_unique = series.nunique()
if 1 <= n_unique <= CONFIG["num_low_cat_threshold"]:
types[col] = "Categorical" # Low-card numeric → Categorical (e.g., SibSp, Parch)
else:
types[col] = "Numeric" # High-card numeric (e.g., Age, Fare)

# Boolean dtype
elif pd.api.types.is_bool_dtype(series):
types[col] = "Categorical"

# String/Text inference (ydata's Text.contains_op + string_is_category)
elif pd.api.types.is_string_dtype(series) or pd.api.types.is_object_dtype(series):
n_unique = series.nunique()
unique_pct = n_unique / len(series)
- is_bool = all(s.lower() in CONFIG["bool_mappings"] for s in series[:5])  # Quick bool check
+ is_bool = all(str(s).lower() in CONFIG["bool_mappings"] for s in series[:5])
if is_bool:
- types[col] = "Categorical"  # Bool-like → Categorical
+ types[col] = "Categorical"
elif _looks_like_datetime(series):
types[col] = "DateTime" # String dates → DateTime (checked before cardinality)
elif (
1 <= n_unique <= CONFIG["cat_cardinality_threshold"] and unique_pct < CONFIG["cat_percentage_threshold"]
):
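End to end, the new DateTime path covers both native datetime64 columns and string-dated object columns. A sketch (the expected output assumes the default thresholds; "status" falls through the bool and datetime branches to the cardinality rule):

import pandas as pd
from hashprep.utils.type_inference import infer_types

df = pd.DataFrame({
    "created_at": pd.date_range("2023-01-01", periods=5, freq="D"),  # native datetime64
    "shipped":    ["2023-01-02", "2023-01-03", "2023-01-04",
                   "2023-01-05", "2023-01-06"],                      # string dates
    "status":     ["open", "closed", "open", "open", "closed"],      # low-cardinality strings
})

print(infer_types(df))
# Expected: {'created_at': 'DateTime', 'shipped': 'DateTime', 'status': 'Categorical'}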