From cf8aeea2c6834fd4746a12f4a20f1545ec9df595 Mon Sep 17 00:00:00 2001 From: maskedsyntax Date: Mon, 2 Mar 2026 22:40:12 +0530 Subject: [PATCH 1/2] feat: add first-class DateTime support to type inference, checks, and summaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - type_inference.py: detect datetime64 dtypes and parse object columns (>80% parseable via pd.to_datetime → classified as "DateTime"); datetime detection runs before cardinality-based categorical check so date columns with few unique values are not misclassified - config.py: add DateTimeThresholds (parse threshold, future-date ratio, gap multipliers, min rows) wired into HashPrepConfig - checks/datetime_checks.py: three new checks — - datetime_future_dates: flags values timestamped in the future - datetime_gaps: detects anomalously large gaps vs. median gap - datetime_monotonicity: warns when a high-uniqueness date column is out of temporal order - summaries/variables.py: enhance _summarize_datetime with weekday distribution, hour distribution (sub-day precision), gap statistics, monotonicity, future_count, and has_time_component flag; removed deprecated infer_datetime_format argument across all callers - checks/__init__.py + core/analyzer.py: register three new checks in CHECKS registry and ALL_CHECKS list - tests/test_datetime.py: 31 tests covering type inference, each check unit, summary fields, and end-to-end DatasetAnalyzer integration --- hashprep/checks/__init__.py | 4 + hashprep/checks/datetime_checks.py | 153 +++++++++++++++ hashprep/config.py | 16 ++ hashprep/core/analyzer.py | 3 + hashprep/summaries/variables.py | 56 +++++- hashprep/utils/type_inference.py | 29 ++- tests/test_datetime.py | 290 +++++++++++++++++++++++++++++ 7 files changed, 541 insertions(+), 10 deletions(-) create mode 100644 hashprep/checks/datetime_checks.py create mode 100644 tests/test_datetime.py diff --git a/hashprep/checks/__init__.py b/hashprep/checks/__init__.py 
index f44230f..ccd8379 100644 --- a/hashprep/checks/__init__.py +++ b/hashprep/checks/__init__.py @@ -1,6 +1,7 @@ from .columns import _check_duplicates, _check_high_cardinality, _check_mixed_data_types, _check_single_value_columns from .core import Issue as Issue from .correlations import calculate_correlations +from .datetime_checks import _check_datetime_future_dates, _check_datetime_gaps, _check_datetime_monotonicity from .distribution import _check_uniform_distribution, _check_unique_values from .drift import check_drift from .imbalance import _check_class_imbalance @@ -45,6 +46,9 @@ def _check_dataset_drift(analyzer): "high_zero_counts": _check_high_zero_counts, "extreme_text_lengths": _check_extreme_text_lengths, "datetime_skew": _check_datetime_skew, + "datetime_future_dates": _check_datetime_future_dates, + "datetime_gaps": _check_datetime_gaps, + "datetime_monotonicity": _check_datetime_monotonicity, "missing_patterns": _check_missing_patterns, "skewness": _check_skewness, "dataset_drift": _check_dataset_drift, diff --git a/hashprep/checks/datetime_checks.py b/hashprep/checks/datetime_checks.py new file mode 100644 index 0000000..6c5abfd --- /dev/null +++ b/hashprep/checks/datetime_checks.py @@ -0,0 +1,153 @@ +import numpy as np +import pandas as pd + +from ..config import DEFAULT_CONFIG +from .core import Issue + +_DT_CFG = DEFAULT_CONFIG.datetime + + +def _coerce_datetime(series: pd.Series) -> pd.Series: + """Return a datetime Series regardless of whether the source is datetime64 or object.""" + if pd.api.types.is_datetime64_any_dtype(series): + return series.dropna() + return pd.to_datetime(series, errors="coerce").dropna() + + +def _datetime_cols(analyzer) -> list[str]: + """Return columns inferred as DateTime.""" + return [col for col, typ in analyzer.column_types.items() if typ == "DateTime"] + + +def _check_datetime_future_dates(analyzer) -> list[Issue]: + """Flag datetime columns that contain values in the future (likely data errors).""" + issues 
= [] + now = pd.Timestamp.now() + + for col in _datetime_cols(analyzer): + dt = _coerce_datetime(analyzer.df[col]) + if dt.empty: + continue + + future_count = int((dt > now).sum()) + if future_count == 0: + continue + + future_ratio = future_count / len(dt) + severity = "critical" if future_ratio > _DT_CFG.future_date_critical_ratio else "warning" + impact = "high" if severity == "critical" else "medium" + issues.append( + Issue( + category="datetime_future_dates", + severity=severity, + column=col, + description=( + f"Column '{col}' has {future_count} future-dated values " + f"({future_ratio:.1%} of non-missing) — latest: {dt.max().date()}" + ), + impact_score=impact, + quick_fix=( + "Options:\n" + "- Investigate source: Future dates often indicate data entry errors or clock skew.\n" + "- Cap to present: Replace future dates with today or NaN.\n" + "- Exclude rows: Drop records with future timestamps before training." + ), + ) + ) + return issues + + +def _check_datetime_gaps(analyzer) -> list[Issue]: + """Detect anomalously large gaps in datetime columns (broken time series).""" + issues = [] + + for col in _datetime_cols(analyzer): + dt = _coerce_datetime(analyzer.df[col]).sort_values() + if len(dt) < _DT_CFG.min_rows_for_gap_check: + continue + + diffs = dt.diff().dropna() + if diffs.empty: + continue + + # Work in total seconds for a unit-agnostic comparison + diff_seconds = diffs.dt.total_seconds() + median_gap = float(diff_seconds.median()) + if median_gap <= 0: + continue + + max_gap = float(diff_seconds.max()) + ratio = max_gap / median_gap + + if ratio >= _DT_CFG.gap_multiplier_warning: + severity = "critical" if ratio >= _DT_CFG.gap_multiplier_critical else "warning" + impact = "high" if severity == "critical" else "medium" + + # Locate the gap for a human-readable description + gap_idx = int(np.argmax(diff_seconds.values)) + gap_start = dt.iloc[gap_idx] + gap_end = dt.iloc[gap_idx + 1] + gap_days = (gap_end - gap_start).days + + issues.append( + Issue( 
+ category="datetime_gaps", + severity=severity, + column=col, + description=( + f"Column '{col}' has an anomalous gap of {gap_days} days " + f"({ratio:.0f}× the median gap) between {gap_start.date()} and {gap_end.date()}" + ), + impact_score=impact, + quick_fix=( + "Options:\n" + "- Investigate gap: May indicate missing data collection periods.\n" + "- Impute missing periods: Forward-fill or interpolate for time-series models.\n" + "- Flag as a feature: Create a binary 'gap_present' indicator.\n" + "- Segment model: Train separate models for each contiguous period." + ), + ) + ) + return issues + + +def _check_datetime_monotonicity(analyzer) -> list[Issue]: + """Warn when a datetime column that looks like a time-series index is non-monotonic.""" + issues = [] + + for col in _datetime_cols(analyzer): + dt = _coerce_datetime(analyzer.df[col]) + if len(dt) < _DT_CFG.min_rows_for_gap_check: + continue + + # Only flag if the column has mostly unique values (i.e., likely an index/timestamp) + unique_ratio = dt.nunique() / len(dt) + if unique_ratio < 0.9: + continue + + if not (dt.is_monotonic_increasing or dt.is_monotonic_decreasing): + # Count out-of-order entries + sorted_dt = dt.sort_values() + out_of_order = int((dt.reset_index(drop=True) != sorted_dt.reset_index(drop=True)).sum()) + out_ratio = out_of_order / len(dt) + severity = "warning" + impact = "medium" + issues.append( + Issue( + category="datetime_monotonicity", + severity=severity, + column=col, + description=( + f"Column '{col}' is non-monotonic: {out_of_order} rows " + f"({out_ratio:.1%}) are out of temporal order" + ), + impact_score=impact, + quick_fix=( + "Options:\n" + "- Sort by this column: Restores temporal order before time-series modeling.\n" + "- Investigate duplicates: Non-monotonicity may reveal duplicate or misaligned records.\n" + "- Retain if intentional: Some datasets (e.g., event logs) are legitimately unordered." 
+ ), + ) + ) + return issues diff --git a/hashprep/config.py b/hashprep/config.py index d62d3d9..5c97a4f 100644 --- a/hashprep/config.py +++ b/hashprep/config.py @@ -126,6 +126,21 @@ class ImbalanceThresholds: majority_class_ratio: float = 0.9 +@dataclass(frozen=True) +class DateTimeThresholds: + """Thresholds for datetime-specific checks.""" + + # Ratio of parseable values to classify an object column as DateTime + parse_threshold: float = 0.8 + # Any future-dated values trigger a warning (ratio > 0 → warn, ratio > this → critical) + future_date_critical_ratio: float = 0.05 + # A gap is anomalous if it exceeds this multiple of the median gap + gap_multiplier_warning: float = 5.0 + gap_multiplier_critical: float = 20.0 + # Minimum number of rows needed to run gap/monotonicity checks + min_rows_for_gap_check: int = 10 + + @dataclass(frozen=True) class TypeInferenceConfig: """Configuration for type inference.""" @@ -175,6 +190,7 @@ class HashPrepConfig: drift: DriftThresholds = field(default_factory=DriftThresholds) distribution: DistributionThresholds = field(default_factory=DistributionThresholds) imbalance: ImbalanceThresholds = field(default_factory=ImbalanceThresholds) + datetime: DateTimeThresholds = field(default_factory=DateTimeThresholds) type_inference: TypeInferenceConfig = field(default_factory=TypeInferenceConfig) sampling: SamplingDefaults = field(default_factory=SamplingDefaults) summaries: SummaryDefaults = field(default_factory=SummaryDefaults) diff --git a/hashprep/core/analyzer.py b/hashprep/core/analyzer.py index 31e4161..ab696c8 100644 --- a/hashprep/core/analyzer.py +++ b/hashprep/core/analyzer.py @@ -49,6 +49,9 @@ class DatasetAnalyzer: "high_zero_counts", "extreme_text_lengths", "datetime_skew", + "datetime_future_dates", + "datetime_gaps", + "datetime_monotonicity", "missing_patterns", "skewness", "dataset_drift", diff --git a/hashprep/summaries/variables.py b/hashprep/summaries/variables.py index 8346a74..a23ef07 100644 --- 
a/hashprep/summaries/variables.py +++ b/hashprep/summaries/variables.py @@ -324,9 +324,14 @@ def _summarize_categorical(df, col): def _summarize_datetime(df, col): - dt_series = pd.to_datetime(df[col], errors="coerce") + if pd.api.types.is_datetime64_any_dtype(df[col]): + dt_series = df[col] + parse_fails = 0 + else: + dt_series = pd.to_datetime(df[col], errors="coerce") + parse_fails = int((dt_series.isna() & df[col].notna()).sum()) + valid_series = dt_series.dropna() - parse_fails = int((dt_series.isna() & df[col].notna()).sum()) invalid_percentage = (parse_fails / len(df) * 100) if len(df) > 0 else 0.0 if valid_series.empty: @@ -337,15 +342,48 @@ def _summarize_datetime(df, col): "invalid_count": parse_fails, "invalid_percentage": invalid_percentage, "counts": None, + "gap_stats": None, + "monotonicity": None, + "future_count": None, } min_dt = valid_series.min() max_dt = valid_series.max() range_delta = max_dt - min_dt + now = pd.Timestamp.now() + + year_counts = {int(k): int(v) for k, v in valid_series.dt.year.value_counts().items()} + month_counts = {int(k): int(v) for k, v in valid_series.dt.month.value_counts().items()} + weekday_counts = {int(k): int(v) for k, v in valid_series.dt.dayofweek.value_counts().items()} + day_counts = {int(k): int(v) for k, v in valid_series.dt.day.value_counts().items()} + + # Sub-day precision: include hour distribution if values have non-zero hours + has_time = bool((valid_series.dt.hour != 0).any() or (valid_series.dt.minute != 0).any()) + hour_counts = {int(k): int(v) for k, v in valid_series.dt.hour.value_counts().items()} if has_time else None + + # Gap statistics (sorted diffs) + sorted_series = valid_series.sort_values() + diffs = sorted_series.diff().dropna() + gap_stats = None + if len(diffs) > 0: + diff_seconds = diffs.dt.total_seconds() + gap_stats = { + "median_gap_seconds": float(diff_seconds.median()), + "max_gap_seconds": float(diff_seconds.max()), + "min_gap_seconds": float(diff_seconds.min()), + 
"mean_gap_seconds": float(diff_seconds.mean()), + } + + # Monotonicity + if valid_series.is_monotonic_increasing: + monotonicity = "increasing" + elif valid_series.is_monotonic_decreasing: + monotonicity = "decreasing" + else: + monotonicity = "non-monotonic" - year_counts = valid_series.dt.year.value_counts().to_dict() - month_counts = valid_series.dt.month.value_counts().to_dict() - day_counts = valid_series.dt.day.value_counts().to_dict() + # Future dates + future_count = int((valid_series > now).sum()) stats = { "minimum": str(min_dt), @@ -353,11 +391,17 @@ def _summarize_datetime(df, col): "range_days": int(range_delta.days), "range_str": str(range_delta), "invalid_count": parse_fails, - "invalid_percentage": invalid_percentage, + "invalid_percentage": float(invalid_percentage), + "future_count": future_count, + "monotonicity": monotonicity, + "has_time_component": has_time, + "gap_stats": gap_stats, "counts": { "years": year_counts, "months": month_counts, + "weekdays": weekday_counts, "days": day_counts, + "hours": hour_counts, }, } return stats diff --git a/hashprep/utils/type_inference.py b/hashprep/utils/type_inference.py index 49676ed..6894f83 100644 --- a/hashprep/utils/type_inference.py +++ b/hashprep/utils/type_inference.py @@ -3,6 +3,7 @@ from ..config import DEFAULT_CONFIG _TYPE_CFG = DEFAULT_CONFIG.type_inference +_DT_CFG = DEFAULT_CONFIG.datetime CONFIG = { "cat_cardinality_threshold": _TYPE_CFG.cat_cardinality_threshold, "cat_percentage_threshold": _TYPE_CFG.cat_percentage_threshold, @@ -11,10 +12,20 @@ } +def _looks_like_datetime(series: pd.Series) -> bool: + """Return True if an object/string column parses as datetime above the threshold.""" + sample = series.dropna().head(200) + if len(sample) == 0: + return False + parsed = pd.to_datetime(sample, errors="coerce") + parse_ratio = parsed.notna().mean() + return float(parse_ratio) >= _DT_CFG.parse_threshold + + def infer_types(df: pd.DataFrame) -> dict[str, str]: """ Infer semantic types per 
ydata logic. - Returns: {col: 'Numeric' | 'Categorical' | 'Text' | 'Unsupported'} + Returns: {col: 'Numeric' | 'Categorical' | 'Text' | 'DateTime' | 'Boolean' | 'Unsupported'} """ types = {} for col in df.columns: @@ -23,21 +34,31 @@ def infer_types(df: pd.DataFrame) -> dict[str, str]: types[col] = "Unsupported" continue + # DateTime: native datetime64 dtype + if pd.api.types.is_datetime64_any_dtype(series): + types[col] = "DateTime" + # Numeric inference (ydata's Numeric.contains_op + numeric_is_category) - if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series): + elif pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series): n_unique = series.nunique() if 1 <= n_unique <= CONFIG["num_low_cat_threshold"]: types[col] = "Categorical" # Low-card numeric → Categorical (e.g., SibSp, Parch) else: types[col] = "Numeric" # High-card numeric (e.g., Age, Fare) + # Boolean dtype + elif pd.api.types.is_bool_dtype(series): + types[col] = "Categorical" + # String/Text inference (ydata's Text.contains_op + string_is_category) elif pd.api.types.is_string_dtype(series) or pd.api.types.is_object_dtype(series): n_unique = series.nunique() unique_pct = n_unique / len(series) - is_bool = all(s.lower() in CONFIG["bool_mappings"] for s in series[:5]) # Quick bool check + is_bool = all(str(s).lower() in CONFIG["bool_mappings"] for s in series[:5]) if is_bool: - types[col] = "Categorical" # Bool-like → Categorical + types[col] = "Categorical" + elif _looks_like_datetime(series): + types[col] = "DateTime" # String dates → DateTime (checked before cardinality) elif ( 1 <= n_unique <= CONFIG["cat_cardinality_threshold"] and unique_pct < CONFIG["cat_percentage_threshold"] ): diff --git a/tests/test_datetime.py b/tests/test_datetime.py new file mode 100644 index 0000000..1511510 --- /dev/null +++ b/tests/test_datetime.py @@ -0,0 +1,290 @@ +"""Tests for DateTime support: type inference, checks, and summaries.""" + +import numpy as np +import 
pandas as pd +import pytest + +from hashprep import DatasetAnalyzer +from hashprep.checks.datetime_checks import ( + _check_datetime_future_dates, + _check_datetime_gaps, + _check_datetime_monotonicity, +) +from hashprep.summaries.variables import _summarize_datetime +from hashprep.utils.type_inference import infer_types + + +# --------------------------------------------------------------------------- +# Type inference +# --------------------------------------------------------------------------- + + +class TestDateTimeTypeInference: + def test_native_datetime64_col(self): + df = pd.DataFrame({"ts": pd.date_range("2020-01-01", periods=50, freq="D")}) + types = infer_types(df) + assert types["ts"] == "DateTime" + + def test_string_iso_dates_inferred_as_datetime(self): + dates = [f"2021-{m:02d}-01" for m in range(1, 13)] * 5 + df = pd.DataFrame({"date": dates}) + types = infer_types(df) + assert types["date"] == "DateTime" + + def test_string_datetime_with_time(self): + timestamps = ["2022-03-15 08:30:00", "2022-03-16 12:00:00"] * 30 + df = pd.DataFrame({"created_at": timestamps}) + types = infer_types(df) + assert types["created_at"] == "DateTime" + + def test_non_date_strings_not_datetime(self): + df = pd.DataFrame({"name": ["Alice", "Bob", "Carol", "Dave"] * 20}) + types = infer_types(df) + assert types["name"] != "DateTime" + + def test_low_cardinality_numeric_still_categorical(self): + df = pd.DataFrame({"flag": [0, 1, 0, 1] * 25}) + types = infer_types(df) + assert types["flag"] == "Categorical" + + def test_mostly_unparseable_object_column(self): + # Only 20% parseable → should NOT be DateTime + values = ["2021-01-01"] * 2 + ["foo bar baz"] * 8 + df = pd.DataFrame({"mixed": values * 10}) + types = infer_types(df) + assert types["mixed"] != "DateTime" + + def test_empty_col_is_unsupported(self): + df = pd.DataFrame({"empty": [None, None, None]}) + types = infer_types(df) + assert types["empty"] == "Unsupported" + + +# 
--------------------------------------------------------------------------- +# _check_datetime_future_dates +# --------------------------------------------------------------------------- + + +class _FakeAnalyzer: + """Minimal stand-in for DatasetAnalyzer used in unit tests.""" + + def __init__(self, df, column_types): + self.df = df + self.column_types = column_types + + +class TestFutureDatesCheck: + def _make(self, dates, col="date"): + df = pd.DataFrame({col: pd.to_datetime(dates)}) + return _FakeAnalyzer(df, {col: "DateTime"}) + + def test_no_future_dates(self): + analyzer = self._make(["2020-01-01", "2021-06-15", "2019-12-31"]) + issues = _check_datetime_future_dates(analyzer) + assert issues == [] + + def test_small_future_ratio_is_warning(self): + past = pd.date_range("2020-01-01", periods=98, freq="D").tolist() + future = [pd.Timestamp.now() + pd.Timedelta(days=365)] * 2 # 2% future + analyzer = self._make(past + future) + issues = _check_datetime_future_dates(analyzer) + assert len(issues) == 1 + assert issues[0].severity == "warning" + assert issues[0].category == "datetime_future_dates" + + def test_large_future_ratio_is_critical(self): + past = pd.date_range("2020-01-01", periods=90, freq="D").tolist() + future = [pd.Timestamp.now() + pd.Timedelta(days=365)] * 10 # 10% future + analyzer = self._make(past + future) + issues = _check_datetime_future_dates(analyzer) + assert len(issues) == 1 + assert issues[0].severity == "critical" + + def test_non_datetime_col_ignored(self): + df = pd.DataFrame({"score": [1.0, 2.0, 3.0]}) + analyzer = _FakeAnalyzer(df, {"score": "Numeric"}) + issues = _check_datetime_future_dates(analyzer) + assert issues == [] + + +# --------------------------------------------------------------------------- +# _check_datetime_gaps +# --------------------------------------------------------------------------- + + +class TestDatetimeGapsCheck: + def _make(self, dates, col="ts"): + df = pd.DataFrame({col: pd.to_datetime(dates)}) + return 
_FakeAnalyzer(df, {col: "DateTime"}) + + def test_regular_series_no_gap_issue(self): + dates = pd.date_range("2020-01-01", periods=30, freq="D") + analyzer = self._make(dates) + issues = _check_datetime_gaps(analyzer) + assert issues == [] + + def test_large_gap_raises_warning(self): + # 20 days of daily data, then a jump of roughly seven months + regular = pd.date_range("2020-01-01", periods=20, freq="D").tolist() + after_gap = pd.date_range("2020-09-01", periods=20, freq="D").tolist() + analyzer = self._make(regular + after_gap) + issues = _check_datetime_gaps(analyzer) + assert len(issues) == 1 + assert issues[0].category == "datetime_gaps" + + def test_too_few_rows_skipped(self): + dates = pd.date_range("2020-01-01", periods=5, freq="D") + analyzer = self._make(dates) + issues = _check_datetime_gaps(analyzer) + assert issues == [] + + +# --------------------------------------------------------------------------- +# _check_datetime_monotonicity +# --------------------------------------------------------------------------- + + +class TestDatetimeMonotonicityCheck: + def _make(self, dates, col="ts"): + df = pd.DataFrame({col: pd.to_datetime(dates)}) + return _FakeAnalyzer(df, {col: "DateTime"}) + + def test_monotonic_increasing_no_issue(self): + dates = pd.date_range("2020-01-01", periods=50, freq="D") + analyzer = self._make(dates) + issues = _check_datetime_monotonicity(analyzer) + assert issues == [] + + def test_non_monotonic_raises_warning(self): + dates = pd.date_range("2020-01-01", periods=50, freq="D").tolist() + # Shuffle to break monotonicity + dates[10], dates[20] = dates[20], dates[10] + dates[30], dates[40] = dates[40], dates[30] + analyzer = self._make(dates) + issues = _check_datetime_monotonicity(analyzer) + assert len(issues) == 1 + assert issues[0].category == "datetime_monotonicity" + assert issues[0].severity == "warning" + + def test_low_unique_ratio_skipped(self): + # Many duplicate timestamps → not treated as a time-series index + dates = ["2020-01-01"] * 50 +
analyzer = self._make(dates) + issues = _check_datetime_monotonicity(analyzer) + assert issues == [] + + +# --------------------------------------------------------------------------- +# _summarize_datetime +# --------------------------------------------------------------------------- + + +class TestSummarizeDatetime: + def _df(self, col, values): + return pd.DataFrame({col: values}) + + def test_basic_fields_present(self): + df = self._df("dt", pd.date_range("2020-01-01", periods=30, freq="D")) + result = _summarize_datetime(df, "dt") + for key in ("minimum", "maximum", "range_days", "counts", "gap_stats", "monotonicity", "future_count"): + assert key in result, f"Missing key: {key}" + + def test_monotonicity_increasing(self): + df = self._df("dt", pd.date_range("2021-01-01", periods=20, freq="D")) + result = _summarize_datetime(df, "dt") + assert result["monotonicity"] == "increasing" + + def test_counts_contain_weekdays(self): + df = self._df("dt", pd.date_range("2020-01-01", periods=30, freq="D")) + result = _summarize_datetime(df, "dt") + assert "weekdays" in result["counts"] + + def test_string_dates_parsed(self): + df = self._df("date", ["2021-01-01", "2021-06-15", "2022-03-20"]) + result = _summarize_datetime(df, "date") + assert result["minimum"] is not None + assert result["range_days"] > 0 + + def test_all_missing_returns_none_fields(self): + df = self._df("dt", [None, None, None]) + result = _summarize_datetime(df, "dt") + assert result["minimum"] is None + assert result["counts"] is None + + def test_gap_stats_present_for_regular_series(self): + df = self._df("dt", pd.date_range("2020-01-01", periods=20, freq="D")) + result = _summarize_datetime(df, "dt") + assert result["gap_stats"] is not None + assert result["gap_stats"]["median_gap_seconds"] > 0 + + def test_has_time_component_false_for_date_only(self): + df = self._df("dt", pd.date_range("2020-01-01", periods=10, freq="D")) + result = _summarize_datetime(df, "dt") + assert 
result["has_time_component"] is False + + def test_has_time_component_true_for_timestamps(self): + timestamps = pd.date_range("2020-01-01 08:00", periods=10, freq="h") + df = self._df("ts", timestamps) + result = _summarize_datetime(df, "ts") + assert result["has_time_component"] is True + assert result["counts"]["hours"] is not None + + +# --------------------------------------------------------------------------- +# Integration: DatasetAnalyzer picks up DateTime columns end-to-end +# --------------------------------------------------------------------------- + + +class TestDateTimeIntegration: + def test_datetime_column_typed_correctly(self): + df = pd.DataFrame( + { + "date": pd.date_range("2020-01-01", periods=50, freq="D"), + "value": np.random.default_rng(0).standard_normal(50), + } + ) + analyzer = DatasetAnalyzer(df, auto_sample=False) + summary = analyzer.analyze() + assert summary["column_types"]["date"] == "DateTime" + + def test_string_date_column_typed_correctly(self): + df = pd.DataFrame( + { + "created": [f"2021-{m:02d}-15" for m in range(1, 13)] * 5, + "amount": np.random.default_rng(1).standard_normal(60), + } + ) + analyzer = DatasetAnalyzer(df, auto_sample=False) + summary = analyzer.analyze() + assert summary["column_types"]["created"] == "DateTime" + + def test_datetime_summary_in_variables(self): + df = pd.DataFrame({"ts": pd.date_range("2022-01-01", periods=30, freq="D")}) + analyzer = DatasetAnalyzer(df, auto_sample=False) + summary = analyzer.analyze() + var = summary["summaries"]["variables"]["ts"] + assert var["category"] == "DateTime" + assert var["minimum"] is not None + assert var["counts"]["weekdays"] is not None + + def test_future_dates_issue_detected(self): + past = pd.date_range("2020-01-01", periods=90, freq="D").tolist() + future = [pd.Timestamp.now() + pd.Timedelta(days=400)] * 10 + df = pd.DataFrame({"date": past + future}) + analyzer = DatasetAnalyzer(df, selected_checks=["datetime_future_dates"], auto_sample=False) + summary 
= analyzer.analyze() + categories = [i["category"] for i in summary["issues"]] + assert "datetime_future_dates" in categories + + def test_gap_issue_detected(self): + regular = pd.date_range("2020-01-01", periods=20, freq="D").tolist() + after = pd.date_range("2021-06-01", periods=20, freq="D").tolist() + df = pd.DataFrame({"ts": regular + after, "val": range(40)}) + analyzer = DatasetAnalyzer(df, selected_checks=["datetime_gaps"], auto_sample=False) + summary = analyzer.analyze() + categories = [i["category"] for i in summary["issues"]] + assert "datetime_gaps" in categories + + def test_new_checks_in_all_checks(self): + for check in ("datetime_future_dates", "datetime_gaps", "datetime_monotonicity"): + assert check in DatasetAnalyzer.ALL_CHECKS From 01b05bd3f64cf63000125c26e056b69727a2137c Mon Sep 17 00:00:00 2001 From: maskedsyntax Date: Mon, 2 Mar 2026 22:45:19 +0530 Subject: [PATCH 2/2] fix: remove unused pytest import and sort imports in test_datetime --- tests/test_datetime.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_datetime.py b/tests/test_datetime.py index 1511510..67c6c73 100644 --- a/tests/test_datetime.py +++ b/tests/test_datetime.py @@ -2,7 +2,6 @@ import numpy as np import pandas as pd -import pytest from hashprep import DatasetAnalyzer from hashprep.checks.datetime_checks import ( @@ -13,7 +12,6 @@ from hashprep.summaries.variables import _summarize_datetime from hashprep.utils.type_inference import infer_types - # --------------------------------------------------------------------------- # Type inference # ---------------------------------------------------------------------------