From cf8aeea2c6834fd4746a12f4a20f1545ec9df595 Mon Sep 17 00:00:00 2001 From: maskedsyntax Date: Mon, 2 Mar 2026 22:40:12 +0530 Subject: [PATCH 1/2] feat: add first-class DateTime support to type inference, checks, and summaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - type_inference.py: detect datetime64 dtypes and parse object columns (>80% parseable via pd.to_datetime → classified as "DateTime"); datetime detection runs before cardinality-based categorical check so date columns with few unique values are not misclassified - config.py: add DateTimeThresholds (parse threshold, future-date ratio, gap multipliers, min rows) wired into HashPrepConfig - checks/datetime_checks.py: three new checks — - datetime_future_dates: flags values timestamped in the future - datetime_gaps: detects anomalously large gaps vs. median gap - datetime_monotonicity: warns when a high-uniqueness date column is out of temporal order - summaries/variables.py: enhance _summarize_datetime with weekday distribution, hour distribution (sub-day precision), gap statistics, monotonicity, future_count, and has_time_component flag; removed deprecated infer_datetime_format argument across all callers - checks/__init__.py + core/analyzer.py: register three new checks in CHECKS registry and ALL_CHECKS list - tests/test_datetime.py: 31 tests covering type inference, each check unit, summary fields, and end-to-end DatasetAnalyzer integration --- hashprep/checks/__init__.py | 4 + hashprep/checks/datetime_checks.py | 153 +++++++++++++++ hashprep/config.py | 16 ++ hashprep/core/analyzer.py | 3 + hashprep/summaries/variables.py | 56 +++++- hashprep/utils/type_inference.py | 29 ++- tests/test_datetime.py | 290 +++++++++++++++++++++++++++++ 7 files changed, 541 insertions(+), 10 deletions(-) create mode 100644 hashprep/checks/datetime_checks.py create mode 100644 tests/test_datetime.py diff --git a/hashprep/checks/__init__.py b/hashprep/checks/__init__.py 
index f44230f..ccd8379 100644 --- a/hashprep/checks/__init__.py +++ b/hashprep/checks/__init__.py @@ -1,6 +1,7 @@ from .columns import _check_duplicates, _check_high_cardinality, _check_mixed_data_types, _check_single_value_columns from .core import Issue as Issue from .correlations import calculate_correlations +from .datetime_checks import _check_datetime_future_dates, _check_datetime_gaps, _check_datetime_monotonicity from .distribution import _check_uniform_distribution, _check_unique_values from .drift import check_drift from .imbalance import _check_class_imbalance @@ -45,6 +46,9 @@ def _check_dataset_drift(analyzer): "high_zero_counts": _check_high_zero_counts, "extreme_text_lengths": _check_extreme_text_lengths, "datetime_skew": _check_datetime_skew, + "datetime_future_dates": _check_datetime_future_dates, + "datetime_gaps": _check_datetime_gaps, + "datetime_monotonicity": _check_datetime_monotonicity, "missing_patterns": _check_missing_patterns, "skewness": _check_skewness, "dataset_drift": _check_dataset_drift, diff --git a/hashprep/checks/datetime_checks.py b/hashprep/checks/datetime_checks.py new file mode 100644 index 0000000..6c5abfd --- /dev/null +++ b/hashprep/checks/datetime_checks.py @@ -0,0 +1,153 @@ +import numpy as np +import pandas as pd + +from ..config import DEFAULT_CONFIG +from .core import Issue + +_DT_CFG = DEFAULT_CONFIG.datetime + + +def _coerce_datetime(series: pd.Series) -> pd.Series: + """Return a datetime Series regardless of whether the source is datetime64 or object.""" + if pd.api.types.is_datetime64_any_dtype(series): + return series.dropna() + return pd.to_datetime(series, errors="coerce").dropna() + + +def _datetime_cols(analyzer) -> list[str]: + """Return columns inferred as DateTime.""" + return [col for col, typ in analyzer.column_types.items() if typ == "DateTime"] + + +def _check_datetime_future_dates(analyzer) -> list[Issue]: + """Flag datetime columns that contain values in the future (likely data errors).""" + issues 
= [] + now = pd.Timestamp.now() + + for col in _datetime_cols(analyzer): + dt = _coerce_datetime(analyzer.df[col]) + if dt.empty: + continue + + future_count = int((dt > now).sum()) + if future_count == 0: + continue + + future_ratio = future_count / len(dt) + severity = "critical" if future_ratio > _DT_CFG.future_date_critical_ratio else "warning" + impact = "high" if severity == "critical" else "medium" + issues.append( + Issue( + category="datetime_future_dates", + severity=severity, + column=col, + description=( + f"Column '{col}' has {future_count} future-dated values " + f"({future_ratio:.1%} of non-missing) — latest: {dt.max().date()}" + ), + impact_score=impact, + quick_fix=( + "Options:\n" + "- Investigate source: Future dates often indicate data entry errors or clock skew.\n" + "- Cap to present: Replace future dates with today or NaN.\n" + "- Exclude rows: Drop records with future timestamps before training." + ), + ) + ) + return issues + + +def _check_datetime_gaps(analyzer) -> list[Issue]: + """Detect anomalously large gaps in datetime columns (broken time series).""" + issues = [] + + for col in _datetime_cols(analyzer): + dt = _coerce_datetime(analyzer.df[col]).sort_values() + if len(dt) < _DT_CFG.min_rows_for_gap_check: + continue + + diffs = dt.diff().dropna() + if diffs.empty: + continue + + # Work in total seconds for a unit-agnostic comparison + diff_seconds = diffs.dt.total_seconds() + median_gap = float(diff_seconds.median()) + if median_gap <= 0: + continue + + max_gap = float(diff_seconds.max()) + ratio = max_gap / median_gap + + if ratio >= _DT_CFG.gap_multiplier_warning: + severity = "critical" if ratio >= _DT_CFG.gap_multiplier_critical else "warning" + impact = "high" if severity == "critical" else "medium" + + # Locate the gap for a human-readable description + gap_idx = int(np.argmax(diff_seconds.values)) + gap_start = dt.iloc[gap_idx] + gap_end = dt.iloc[gap_idx + 1] + gap_days = (gap_end - gap_start).days + + issues.append( + Issue( 
+ category="datetime_gaps", + severity=severity, + column=col, + description=( + f"Column '{col}' has an anomalous gap of {gap_days} days " + f"({ratio:.0f}× the median gap) between {gap_start.date()} and {gap_end.date()}" + ), + impact_score=impact, + quick_fix=( + "Options:\n" + "- Investigate gap: May indicate missing data collection periods.\n" + "- Impute missing periods: Forward-fill or interpolate for time-series models.\n" + "- Flag as a feature: Create a binary 'gap_present' indicator.\n" + "- Segment model: Train separate models for each contiguous period." + ), + ) + ) + return issues + + +def _check_datetime_monotonicity(analyzer) -> list[Issue]: + """Warn when a datetime column that looks like a time-series index is non-monotonic.""" + issues = [] + + for col in _datetime_cols(analyzer): + dt = _coerce_datetime(analyzer.df[col]) + if len(dt) < _DT_CFG.min_rows_for_gap_check: + continue + + # Only flag if the column has mostly unique values (i.e., likely an index/timestamp) + unique_ratio = dt.nunique() / len(dt) + if unique_ratio < 0.9: + continue + + if not (dt.is_monotonic_increasing or dt.is_monotonic_decreasing): + # Count out-of-order entries + sorted_dt = dt.sort_values() + out_of_order = int((dt.reset_index(drop=True) != sorted_dt.reset_index(drop=True)).sum()) + out_ratio = out_of_order / len(dt) + severity = "warning" + impact = "medium" + issues.append( + Issue( + category="datetime_monotonicity", + severity=severity, + column=col, + description=( + f"Column '{col}' is non-monotonic: {out_of_order} rows " + f"({out_ratio:.1%}) are out of temporal order" + ), + impact_score=impact, + quick_fix=( + "Options:\n" + "- Sort by this column: Restores temporal order before time-series modeling.\n" + "- Investigate duplicates: Non-monotonicity may reveal duplicate or misaligned records.\n" + "- Retain if intentional: Some datasets (e.g., event logs) are legitimately unordered." 
+ ), + ) + ) + return issues diff --git a/hashprep/config.py b/hashprep/config.py index d62d3d9..5c97a4f 100644 --- a/hashprep/config.py +++ b/hashprep/config.py @@ -126,6 +126,21 @@ class ImbalanceThresholds: majority_class_ratio: float = 0.9 +@dataclass(frozen=True) +class DateTimeThresholds: + """Thresholds for datetime-specific checks.""" + + # Ratio of parseable values to classify an object column as DateTime + parse_threshold: float = 0.8 + # Any future-dated values trigger a warning (ratio > 0 → warn, ratio > this → critical) + future_date_critical_ratio: float = 0.05 + # A gap is anomalous if it exceeds this multiple of the median gap + gap_multiplier_warning: float = 5.0 + gap_multiplier_critical: float = 20.0 + # Minimum number of rows needed to run gap/monotonicity checks + min_rows_for_gap_check: int = 10 + + @dataclass(frozen=True) class TypeInferenceConfig: """Configuration for type inference.""" @@ -175,6 +190,7 @@ class HashPrepConfig: drift: DriftThresholds = field(default_factory=DriftThresholds) distribution: DistributionThresholds = field(default_factory=DistributionThresholds) imbalance: ImbalanceThresholds = field(default_factory=ImbalanceThresholds) + datetime: DateTimeThresholds = field(default_factory=DateTimeThresholds) type_inference: TypeInferenceConfig = field(default_factory=TypeInferenceConfig) sampling: SamplingDefaults = field(default_factory=SamplingDefaults) summaries: SummaryDefaults = field(default_factory=SummaryDefaults) diff --git a/hashprep/core/analyzer.py b/hashprep/core/analyzer.py index 31e4161..ab696c8 100644 --- a/hashprep/core/analyzer.py +++ b/hashprep/core/analyzer.py @@ -49,6 +49,9 @@ class DatasetAnalyzer: "high_zero_counts", "extreme_text_lengths", "datetime_skew", + "datetime_future_dates", + "datetime_gaps", + "datetime_monotonicity", "missing_patterns", "skewness", "dataset_drift", diff --git a/hashprep/summaries/variables.py b/hashprep/summaries/variables.py index 8346a74..a23ef07 100644 --- 
a/hashprep/summaries/variables.py +++ b/hashprep/summaries/variables.py @@ -324,9 +324,14 @@ def _summarize_categorical(df, col): def _summarize_datetime(df, col): - dt_series = pd.to_datetime(df[col], errors="coerce") + if pd.api.types.is_datetime64_any_dtype(df[col]): + dt_series = df[col] + parse_fails = 0 + else: + dt_series = pd.to_datetime(df[col], errors="coerce") + parse_fails = int((dt_series.isna() & df[col].notna()).sum()) + valid_series = dt_series.dropna() - parse_fails = int((dt_series.isna() & df[col].notna()).sum()) invalid_percentage = (parse_fails / len(df) * 100) if len(df) > 0 else 0.0 if valid_series.empty: @@ -337,15 +342,48 @@ def _summarize_datetime(df, col): "invalid_count": parse_fails, "invalid_percentage": invalid_percentage, "counts": None, + "gap_stats": None, + "monotonicity": None, + "future_count": None, } min_dt = valid_series.min() max_dt = valid_series.max() range_delta = max_dt - min_dt + now = pd.Timestamp.now() + + year_counts = {int(k): int(v) for k, v in valid_series.dt.year.value_counts().items()} + month_counts = {int(k): int(v) for k, v in valid_series.dt.month.value_counts().items()} + weekday_counts = {int(k): int(v) for k, v in valid_series.dt.dayofweek.value_counts().items()} + day_counts = {int(k): int(v) for k, v in valid_series.dt.day.value_counts().items()} + + # Sub-day precision: include hour distribution if values have non-zero hours + has_time = bool((valid_series.dt.hour != 0).any() or (valid_series.dt.minute != 0).any()) + hour_counts = {int(k): int(v) for k, v in valid_series.dt.hour.value_counts().items()} if has_time else None + + # Gap statistics (sorted diffs) + sorted_series = valid_series.sort_values() + diffs = sorted_series.diff().dropna() + gap_stats = None + if len(diffs) > 0: + diff_seconds = diffs.dt.total_seconds() + gap_stats = { + "median_gap_seconds": float(diff_seconds.median()), + "max_gap_seconds": float(diff_seconds.max()), + "min_gap_seconds": float(diff_seconds.min()), + 
"mean_gap_seconds": float(diff_seconds.mean()), + } + + # Monotonicity + if valid_series.is_monotonic_increasing: + monotonicity = "increasing" + elif valid_series.is_monotonic_decreasing: + monotonicity = "decreasing" + else: + monotonicity = "non-monotonic" - year_counts = valid_series.dt.year.value_counts().to_dict() - month_counts = valid_series.dt.month.value_counts().to_dict() - day_counts = valid_series.dt.day.value_counts().to_dict() + # Future dates + future_count = int((valid_series > now).sum()) stats = { "minimum": str(min_dt), @@ -353,11 +391,17 @@ def _summarize_datetime(df, col): "range_days": int(range_delta.days), "range_str": str(range_delta), "invalid_count": parse_fails, - "invalid_percentage": invalid_percentage, + "invalid_percentage": float(invalid_percentage), + "future_count": future_count, + "monotonicity": monotonicity, + "has_time_component": has_time, + "gap_stats": gap_stats, "counts": { "years": year_counts, "months": month_counts, + "weekdays": weekday_counts, "days": day_counts, + "hours": hour_counts, }, } return stats diff --git a/hashprep/utils/type_inference.py b/hashprep/utils/type_inference.py index 49676ed..6894f83 100644 --- a/hashprep/utils/type_inference.py +++ b/hashprep/utils/type_inference.py @@ -3,6 +3,7 @@ from ..config import DEFAULT_CONFIG _TYPE_CFG = DEFAULT_CONFIG.type_inference +_DT_CFG = DEFAULT_CONFIG.datetime CONFIG = { "cat_cardinality_threshold": _TYPE_CFG.cat_cardinality_threshold, "cat_percentage_threshold": _TYPE_CFG.cat_percentage_threshold, @@ -11,10 +12,20 @@ } +def _looks_like_datetime(series: pd.Series) -> bool: + """Return True if an object/string column parses as datetime above the threshold.""" + sample = series.dropna().head(200) + if len(sample) == 0: + return False + parsed = pd.to_datetime(sample, errors="coerce") + parse_ratio = parsed.notna().mean() + return float(parse_ratio) >= _DT_CFG.parse_threshold + + def infer_types(df: pd.DataFrame) -> dict[str, str]: """ Infer semantic types per 
ydata logic. - Returns: {col: 'Numeric' | 'Categorical' | 'Text' | 'Unsupported'} + Returns: {col: 'Numeric' | 'Categorical' | 'Text' | 'DateTime' | 'Boolean' | 'Unsupported'} """ types = {} for col in df.columns: @@ -23,21 +34,31 @@ def infer_types(df: pd.DataFrame) -> dict[str, str]: types[col] = "Unsupported" continue + # DateTime: native datetime64 dtype + if pd.api.types.is_datetime64_any_dtype(series): + types[col] = "DateTime" + # Numeric inference (ydata's Numeric.contains_op + numeric_is_category) - if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series): + elif pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series): n_unique = series.nunique() if 1 <= n_unique <= CONFIG["num_low_cat_threshold"]: types[col] = "Categorical" # Low-card numeric → Categorical (e.g., SibSp, Parch) else: types[col] = "Numeric" # High-card numeric (e.g., Age, Fare) + # Boolean dtype + elif pd.api.types.is_bool_dtype(series): + types[col] = "Categorical" + # String/Text inference (ydata's Text.contains_op + string_is_category) elif pd.api.types.is_string_dtype(series) or pd.api.types.is_object_dtype(series): n_unique = series.nunique() unique_pct = n_unique / len(series) - is_bool = all(s.lower() in CONFIG["bool_mappings"] for s in series[:5]) # Quick bool check + is_bool = all(str(s).lower() in CONFIG["bool_mappings"] for s in series[:5]) if is_bool: - types[col] = "Categorical" # Bool-like → Categorical + types[col] = "Categorical" + elif _looks_like_datetime(series): + types[col] = "DateTime" # String dates → DateTime (checked before cardinality) elif ( 1 <= n_unique <= CONFIG["cat_cardinality_threshold"] and unique_pct < CONFIG["cat_percentage_threshold"] ): diff --git a/tests/test_datetime.py b/tests/test_datetime.py new file mode 100644 index 0000000..1511510 --- /dev/null +++ b/tests/test_datetime.py @@ -0,0 +1,290 @@ +"""Tests for DateTime support: type inference, checks, and summaries.""" + +import numpy as np +import 
pandas as pd +import pytest + +from hashprep import DatasetAnalyzer +from hashprep.checks.datetime_checks import ( + _check_datetime_future_dates, + _check_datetime_gaps, + _check_datetime_monotonicity, +) +from hashprep.summaries.variables import _summarize_datetime +from hashprep.utils.type_inference import infer_types + + +# --------------------------------------------------------------------------- +# Type inference +# --------------------------------------------------------------------------- + + +class TestDateTimeTypeInference: + def test_native_datetime64_col(self): + df = pd.DataFrame({"ts": pd.date_range("2020-01-01", periods=50, freq="D")}) + types = infer_types(df) + assert types["ts"] == "DateTime" + + def test_string_iso_dates_inferred_as_datetime(self): + dates = [f"2021-{m:02d}-01" for m in range(1, 13)] * 5 + df = pd.DataFrame({"date": dates}) + types = infer_types(df) + assert types["date"] == "DateTime" + + def test_string_datetime_with_time(self): + timestamps = ["2022-03-15 08:30:00", "2022-03-16 12:00:00"] * 30 + df = pd.DataFrame({"created_at": timestamps}) + types = infer_types(df) + assert types["created_at"] == "DateTime" + + def test_non_date_strings_not_datetime(self): + df = pd.DataFrame({"name": ["Alice", "Bob", "Carol", "Dave"] * 20}) + types = infer_types(df) + assert types["name"] != "DateTime" + + def test_low_cardinality_numeric_still_categorical(self): + df = pd.DataFrame({"flag": [0, 1, 0, 1] * 25}) + types = infer_types(df) + assert types["flag"] == "Categorical" + + def test_mostly_unparseable_object_column(self): + # Only 20% parseable → should NOT be DateTime + values = ["2021-01-01"] * 2 + ["foo bar baz"] * 8 + df = pd.DataFrame({"mixed": values * 10}) + types = infer_types(df) + assert types["mixed"] != "DateTime" + + def test_empty_col_is_unsupported(self): + df = pd.DataFrame({"empty": [None, None, None]}) + types = infer_types(df) + assert types["empty"] == "Unsupported" + + +# 
--------------------------------------------------------------------------- +# _check_datetime_future_dates +# --------------------------------------------------------------------------- + + +class _FakeAnalyzer: + """Minimal stand-in for DatasetAnalyzer used in unit tests.""" + + def __init__(self, df, column_types): + self.df = df + self.column_types = column_types + + +class TestFutureDatesCheck: + def _make(self, dates, col="date"): + df = pd.DataFrame({col: pd.to_datetime(dates)}) + return _FakeAnalyzer(df, {col: "DateTime"}) + + def test_no_future_dates(self): + analyzer = self._make(["2020-01-01", "2021-06-15", "2019-12-31"]) + issues = _check_datetime_future_dates(analyzer) + assert issues == [] + + def test_small_future_ratio_is_warning(self): + past = pd.date_range("2020-01-01", periods=98, freq="D").tolist() + future = [pd.Timestamp.now() + pd.Timedelta(days=365)] * 2 # 2% future + analyzer = self._make(past + future) + issues = _check_datetime_future_dates(analyzer) + assert len(issues) == 1 + assert issues[0].severity == "warning" + assert issues[0].category == "datetime_future_dates" + + def test_large_future_ratio_is_critical(self): + past = pd.date_range("2020-01-01", periods=90, freq="D").tolist() + future = [pd.Timestamp.now() + pd.Timedelta(days=365)] * 10 # 10% future + analyzer = self._make(past + future) + issues = _check_datetime_future_dates(analyzer) + assert len(issues) == 1 + assert issues[0].severity == "critical" + + def test_non_datetime_col_ignored(self): + df = pd.DataFrame({"score": [1.0, 2.0, 3.0]}) + analyzer = _FakeAnalyzer(df, {"score": "Numeric"}) + issues = _check_datetime_future_dates(analyzer) + assert issues == [] + + +# --------------------------------------------------------------------------- +# _check_datetime_gaps +# --------------------------------------------------------------------------- + + +class TestDatetimeGapsCheck: + def _make(self, dates, col="ts"): + df = pd.DataFrame({col: pd.to_datetime(dates)}) + return 
_FakeAnalyzer(df, {col: "DateTime"}) + + def test_regular_series_no_gap_issue(self): + dates = pd.date_range("2020-01-01", periods=30, freq="D") + analyzer = self._make(dates) + issues = _check_datetime_gaps(analyzer) + assert issues == [] + + def test_large_gap_raises_warning(self): + # 20 days of daily data, then a jump of roughly seven months + regular = pd.date_range("2020-01-01", periods=20, freq="D").tolist() + after_gap = pd.date_range("2020-09-01", periods=20, freq="D").tolist() + analyzer = self._make(regular + after_gap) + issues = _check_datetime_gaps(analyzer) + assert len(issues) == 1 + assert issues[0].category == "datetime_gaps" + + def test_too_few_rows_skipped(self): + dates = pd.date_range("2020-01-01", periods=5, freq="D") + analyzer = self._make(dates) + issues = _check_datetime_gaps(analyzer) + assert issues == [] + + +# --------------------------------------------------------------------------- +# _check_datetime_monotonicity +# --------------------------------------------------------------------------- + + +class TestDatetimeMonotonicityCheck: + def _make(self, dates, col="ts"): + df = pd.DataFrame({col: pd.to_datetime(dates)}) + return _FakeAnalyzer(df, {col: "DateTime"}) + + def test_monotonic_increasing_no_issue(self): + dates = pd.date_range("2020-01-01", periods=50, freq="D") + analyzer = self._make(dates) + issues = _check_datetime_monotonicity(analyzer) + assert issues == [] + + def test_non_monotonic_raises_warning(self): + dates = pd.date_range("2020-01-01", periods=50, freq="D").tolist() + # Shuffle to break monotonicity + dates[10], dates[20] = dates[20], dates[10] + dates[30], dates[40] = dates[40], dates[30] + analyzer = self._make(dates) + issues = _check_datetime_monotonicity(analyzer) + assert len(issues) == 1 + assert issues[0].category == "datetime_monotonicity" + assert issues[0].severity == "warning" + + def test_low_unique_ratio_skipped(self): + # Many duplicate timestamps → not treated as a time-series index + dates = ["2020-01-01"] * 50 +
analyzer = self._make(dates) + issues = _check_datetime_monotonicity(analyzer) + assert issues == [] + + +# --------------------------------------------------------------------------- +# _summarize_datetime +# --------------------------------------------------------------------------- + + +class TestSummarizeDatetime: + def _df(self, col, values): + return pd.DataFrame({col: values}) + + def test_basic_fields_present(self): + df = self._df("dt", pd.date_range("2020-01-01", periods=30, freq="D")) + result = _summarize_datetime(df, "dt") + for key in ("minimum", "maximum", "range_days", "counts", "gap_stats", "monotonicity", "future_count"): + assert key in result, f"Missing key: {key}" + + def test_monotonicity_increasing(self): + df = self._df("dt", pd.date_range("2021-01-01", periods=20, freq="D")) + result = _summarize_datetime(df, "dt") + assert result["monotonicity"] == "increasing" + + def test_counts_contain_weekdays(self): + df = self._df("dt", pd.date_range("2020-01-01", periods=30, freq="D")) + result = _summarize_datetime(df, "dt") + assert "weekdays" in result["counts"] + + def test_string_dates_parsed(self): + df = self._df("date", ["2021-01-01", "2021-06-15", "2022-03-20"]) + result = _summarize_datetime(df, "date") + assert result["minimum"] is not None + assert result["range_days"] > 0 + + def test_all_missing_returns_none_fields(self): + df = self._df("dt", [None, None, None]) + result = _summarize_datetime(df, "dt") + assert result["minimum"] is None + assert result["counts"] is None + + def test_gap_stats_present_for_regular_series(self): + df = self._df("dt", pd.date_range("2020-01-01", periods=20, freq="D")) + result = _summarize_datetime(df, "dt") + assert result["gap_stats"] is not None + assert result["gap_stats"]["median_gap_seconds"] > 0 + + def test_has_time_component_false_for_date_only(self): + df = self._df("dt", pd.date_range("2020-01-01", periods=10, freq="D")) + result = _summarize_datetime(df, "dt") + assert 
result["has_time_component"] is False + + def test_has_time_component_true_for_timestamps(self): + timestamps = pd.date_range("2020-01-01 08:00", periods=10, freq="h") + df = self._df("ts", timestamps) + result = _summarize_datetime(df, "ts") + assert result["has_time_component"] is True + assert result["counts"]["hours"] is not None + + +# --------------------------------------------------------------------------- +# Integration: DatasetAnalyzer picks up DateTime columns end-to-end +# --------------------------------------------------------------------------- + + +class TestDateTimeIntegration: + def test_datetime_column_typed_correctly(self): + df = pd.DataFrame( + { + "date": pd.date_range("2020-01-01", periods=50, freq="D"), + "value": np.random.default_rng(0).standard_normal(50), + } + ) + analyzer = DatasetAnalyzer(df, auto_sample=False) + summary = analyzer.analyze() + assert summary["column_types"]["date"] == "DateTime" + + def test_string_date_column_typed_correctly(self): + df = pd.DataFrame( + { + "created": [f"2021-{m:02d}-15" for m in range(1, 13)] * 5, + "amount": np.random.default_rng(1).standard_normal(60), + } + ) + analyzer = DatasetAnalyzer(df, auto_sample=False) + summary = analyzer.analyze() + assert summary["column_types"]["created"] == "DateTime" + + def test_datetime_summary_in_variables(self): + df = pd.DataFrame({"ts": pd.date_range("2022-01-01", periods=30, freq="D")}) + analyzer = DatasetAnalyzer(df, auto_sample=False) + summary = analyzer.analyze() + var = summary["summaries"]["variables"]["ts"] + assert var["category"] == "DateTime" + assert var["minimum"] is not None + assert var["counts"]["weekdays"] is not None + + def test_future_dates_issue_detected(self): + past = pd.date_range("2020-01-01", periods=90, freq="D").tolist() + future = [pd.Timestamp.now() + pd.Timedelta(days=400)] * 10 + df = pd.DataFrame({"date": past + future}) + analyzer = DatasetAnalyzer(df, selected_checks=["datetime_future_dates"], auto_sample=False) + summary 
= analyzer.analyze() + categories = [i["category"] for i in summary["issues"]] + assert "datetime_future_dates" in categories + + def test_gap_issue_detected(self): + regular = pd.date_range("2020-01-01", periods=20, freq="D").tolist() + after = pd.date_range("2021-06-01", periods=20, freq="D").tolist() + df = pd.DataFrame({"ts": regular + after, "val": range(40)}) + analyzer = DatasetAnalyzer(df, selected_checks=["datetime_gaps"], auto_sample=False) + summary = analyzer.analyze() + categories = [i["category"] for i in summary["issues"]] + assert "datetime_gaps" in categories + + def test_new_checks_in_all_checks(self): + for check in ("datetime_future_dates", "datetime_gaps", "datetime_monotonicity"): + assert check in DatasetAnalyzer.ALL_CHECKS From 01b05bd3f64cf63000125c26e056b69727a2137c Mon Sep 17 00:00:00 2001 From: maskedsyntax Date: Mon, 2 Mar 2026 22:45:19 +0530 Subject: [PATCH 2/2] fix: remove unused pytest import and sort imports in test_datetime --- tests/test_datetime.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_datetime.py b/tests/test_datetime.py index 1511510..67c6c73 100644 --- a/tests/test_datetime.py +++ b/tests/test_datetime.py @@ -2,7 +2,6 @@ import numpy as np import pandas as pd -import pytest from hashprep import DatasetAnalyzer from hashprep.checks.datetime_checks import ( @@ -13,7 +12,6 @@ from hashprep.summaries.variables import _summarize_datetime from hashprep.utils.type_inference import infer_types - # --------------------------------------------------------------------------- # Type inference # ---------------------------------------------------------------------------