"""Helpers for choosing feature columns and building an imputed numeric feature matrix."""
from __future__ import annotations
|
|
|
|
from typing import Iterable
|
|
|
|
import pandas as pd
|
|
|
|
|
|
# Default target columns: `select_feature_columns` uses this set as the default
# for its `targets` argument, and targets are never returned as features.
LABEL_COLUMNS = {"label_win", "label_stop_loss"}

# Exact column names that `select_feature_columns` always skips, even when the
# column is numeric.
# NOTE(review): the grouping below is by apparent role, inferred from the names
# only -- confirm against the dataset schema.
EXCLUDED_COLUMNS = {
    # identifiers / descriptive fields
    "id", "trade_id", "date", "ticker", "name", "strategy", "reason", "source_file",
    # timestamps
    "entry_time", "exit_time", "sample_time", "created_at",
    # presumably trade-outcome fields
    "exit_price", "exit_reason", "pnl",
    # presumably outputs of an earlier model
    "ai_win_score", "ai_stop_loss_score", "ai_model_version",
}

# Columns whose name starts with any of these prefixes are skipped as well.
# Kept as a tuple because it is passed straight to `str.startswith`, which
# accepts a tuple of prefixes but not a list or set.
EXCLUDED_PREFIXES = ("price_", "ret_", "mfe_", "mae_")
|
|
|
|
|
|
def select_feature_columns(df: pd.DataFrame, targets: Iterable[str] = LABEL_COLUMNS) -> list[str]:
    """Return the sorted labels of the numeric columns of ``df`` usable as features.

    A column is kept only when all of the following hold:

    * its label is neither in ``EXCLUDED_COLUMNS`` nor in ``targets``;
    * its label does not start with any entry of ``EXCLUDED_PREFIXES``;
    * pandas reports the column's dtype as numeric.

    Args:
        df: Frame whose columns are inspected. It is not modified.
        targets: Column names to exclude in addition to ``EXCLUDED_COLUMNS``.
            A single name may also be given as a plain string.

    Returns:
        The selected column labels in sorted order.
    """
    if isinstance(targets, str):
        # set("label_win") would split the name into single characters and
        # silently fail to exclude the target column.
        targets = (targets,)
    excluded = EXCLUDED_COLUMNS | set(targets)

    selected = [
        column
        for column in df.columns
        if column not in excluded
        # Non-string labels (e.g. integer column names) have no `startswith`
        # and cannot carry one of the excluded prefixes.
        and not (isinstance(column, str) and column.startswith(EXCLUDED_PREFIXES))
        and pd.api.types.is_numeric_dtype(df[column])
    ]
    # key=str leaves the order of string labels unchanged while keeping frames
    # with mixed-type labels sortable.
    return sorted(selected, key=str)
|
|
|
|
|
|
def _to_float_compatible(column: pd.Series) -> pd.Series:
    """Coerce one column to numbers, turning unparseable values into NaN."""
    converted = pd.to_numeric(column, errors="coerce")
    if pd.api.types.is_extension_array_dtype(converted.dtype):
        # Nullable dtypes such as Int64 reject a fractional fill value, so a
        # median like 1.5 would make the later fillna raise TypeError.
        # float64 represents the missing entries as NaN instead.
        converted = converted.astype("float64")
    return converted


def build_feature_matrix(
    df: pd.DataFrame,
    feature_columns: list[str],
    medians: dict[str, float] | None = None,
) -> tuple[pd.DataFrame, dict[str, float]]:
    """Build a fully numeric, gap-free feature frame from ``df``.

    The result has exactly the columns in ``feature_columns``, in that order.
    Columns absent from ``df`` are created empty. Every column is coerced to
    numeric, with values that cannot be parsed becoming missing. Missing
    values are then filled with the per-column median, and anything still
    missing is filled with ``0.0``.

    Args:
        df: Source frame. It is not modified.
        feature_columns: Labels of the columns to include in the output.
        medians: Per-column fill values to use. When ``None``, they are
            computed from the coerced data; a column with no usable values
            gets ``0.0``.

    Returns:
        A ``(features, medians)`` tuple. ``medians`` is the mapping that was
        applied: the caller's object when one was passed, otherwise the newly
        computed one.
    """
    features = df.reindex(columns=feature_columns).apply(_to_float_compatible)

    if medians is None:
        medians = {
            column: float(value) if pd.notna(value) else 0.0
            for column, value in features.median(numeric_only=True).items()
        }

    # The second fillna covers columns that have no entry in a caller-supplied
    # `medians` mapping.
    features = features.fillna(medians).fillna(0.0)
    return features, medians
|