2026-02-20 20:57:25 +08:00
|
|
|
|
"""
|
|
|
|
|
|
方案B:AI模型训练 + 信号生成
|
|
|
|
|
|
使用 LightGBM / XGBoost,Walk-Forward 滚动训练
|
|
|
|
|
|
"""
|
|
|
|
|
|
import json
|
|
|
|
|
|
import joblib
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
from loguru import logger
|
2026-02-21 15:20:33 +08:00
|
|
|
|
from sklearn.utils.class_weight import compute_sample_weight
|
2026-02-21 15:38:23 +08:00
|
|
|
|
from sklearn.preprocessing import StandardScaler
|
2026-02-20 20:57:25 +08:00
|
|
|
|
|
2026-02-21 15:38:23 +08:00
|
|
|
|
from .config import MODEL_CONFIG as MC, FEATURE_CONFIG as FC, PRIMARY_PERIOD, PROJECT_ROOT
|
2026-02-20 20:57:25 +08:00
|
|
|
|
from .feature_engine import prepare_dataset, get_latest_feature_row
|
|
|
|
|
|
from .backtest import BacktestEngine, print_metrics
|
|
|
|
|
|
|
|
|
|
|
|
SCHEME_B_MODEL_DIR = PROJECT_ROOT / 'models'
|
|
|
|
|
|
SCHEME_B_MODEL_FILE = SCHEME_B_MODEL_DIR / 'scheme_b_last_model.joblib'
|
|
|
|
|
|
SCHEME_B_SCALER_FILE = SCHEME_B_MODEL_DIR / 'scheme_b_scaler.joblib'
|
|
|
|
|
|
SCHEME_B_FEATURES_FILE = SCHEME_B_MODEL_DIR / 'scheme_b_features.json'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AIStrategy:
|
|
|
|
|
|
"""AI模型策略 — LightGBM / XGBoost Walk-Forward"""
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, model_type: str = 'lightgbm'):
|
|
|
|
|
|
"""
|
|
|
|
|
|
:param model_type: 'lightgbm' 或 'xgboost'
|
|
|
|
|
|
"""
|
|
|
|
|
|
self.model_type = model_type
|
|
|
|
|
|
self.models = [] # 存储每个窗口训练的模型
|
2026-02-21 15:38:23 +08:00
|
|
|
|
self.scalers = [] # 与每个窗口模型对应的 scaler(若启用标准化)
|
|
|
|
|
|
self.last_scaler = None
|
2026-02-20 20:57:25 +08:00
|
|
|
|
self.feature_importance = None
|
|
|
|
|
|
|
|
|
|
|
|
def _create_model(self):
|
|
|
|
|
|
"""创建模型实例"""
|
|
|
|
|
|
if self.model_type == 'lightgbm':
|
|
|
|
|
|
import lightgbm as lgb
|
|
|
|
|
|
params = MC['lightgbm'].copy()
|
|
|
|
|
|
return lgb.LGBMClassifier(**params)
|
|
|
|
|
|
elif self.model_type == 'xgboost':
|
|
|
|
|
|
import xgboost as xgb
|
|
|
|
|
|
params = MC['xgboost'].copy()
|
|
|
|
|
|
return xgb.XGBClassifier(**params)
|
|
|
|
|
|
else:
|
|
|
|
|
|
raise ValueError(f"不支持的模型类型: {self.model_type}")
|
|
|
|
|
|
|
|
|
|
|
|
def walk_forward_train(self, X: pd.DataFrame, y: pd.Series,
|
2026-02-21 15:38:23 +08:00
|
|
|
|
confidence_threshold: float = 0.45,
|
|
|
|
|
|
normalize: bool = True) -> pd.Series:
|
2026-02-20 20:57:25 +08:00
|
|
|
|
"""
|
|
|
|
|
|
Walk-Forward 滚动训练与预测
|
|
|
|
|
|
:param confidence_threshold: 概率阈值,低于此值的预测设为0(观望)
|
|
|
|
|
|
:return: 全部测试窗口拼接的预测信号
|
|
|
|
|
|
"""
|
|
|
|
|
|
train_size = MC['walk_forward_train_size']
|
|
|
|
|
|
test_size = MC['walk_forward_test_size']
|
|
|
|
|
|
step = MC['walk_forward_step']
|
|
|
|
|
|
|
|
|
|
|
|
n = len(X)
|
2026-02-21 15:38:23 +08:00
|
|
|
|
all_preds = pd.Series(dtype=int)
|
2026-02-20 20:57:25 +08:00
|
|
|
|
window_count = 0
|
|
|
|
|
|
|
|
|
|
|
|
logger.info(f"Walk-Forward: 数据量={n}, 训练窗口={train_size}, "
|
2026-02-21 15:38:23 +08:00
|
|
|
|
f"测试窗口={test_size}, 步长={step}, 置信阈值={confidence_threshold}, "
|
|
|
|
|
|
f"窗口内标准化={'是' if normalize else '否'}")
|
2026-02-20 20:57:25 +08:00
|
|
|
|
|
|
|
|
|
|
start = 0
|
|
|
|
|
|
while start + train_size + test_size <= n:
|
|
|
|
|
|
train_end = start + train_size
|
|
|
|
|
|
test_end = min(train_end + test_size, n)
|
|
|
|
|
|
|
2026-02-21 15:38:23 +08:00
|
|
|
|
X_train_raw = X.iloc[start:train_end]
|
2026-02-20 20:57:25 +08:00
|
|
|
|
y_train = y.iloc[start:train_end]
|
2026-02-21 15:38:23 +08:00
|
|
|
|
X_test_raw = X.iloc[train_end:test_end]
|
2026-02-20 20:57:25 +08:00
|
|
|
|
y_test = y.iloc[train_end:test_end]
|
|
|
|
|
|
|
2026-02-21 15:38:23 +08:00
|
|
|
|
# 每个窗口单独拟合 scaler,避免未来数据泄露
|
|
|
|
|
|
if normalize:
|
|
|
|
|
|
scaler = StandardScaler()
|
|
|
|
|
|
X_train = pd.DataFrame(
|
|
|
|
|
|
scaler.fit_transform(X_train_raw),
|
|
|
|
|
|
columns=X.columns,
|
|
|
|
|
|
index=X_train_raw.index,
|
|
|
|
|
|
)
|
|
|
|
|
|
X_test = pd.DataFrame(
|
|
|
|
|
|
scaler.transform(X_test_raw),
|
|
|
|
|
|
columns=X.columns,
|
|
|
|
|
|
index=X_test_raw.index,
|
|
|
|
|
|
)
|
|
|
|
|
|
else:
|
|
|
|
|
|
scaler = None
|
|
|
|
|
|
X_train = X_train_raw
|
|
|
|
|
|
X_test = X_test_raw
|
|
|
|
|
|
|
2026-02-21 15:20:33 +08:00
|
|
|
|
# 训练:使用类别平衡权重,避免模型偏向做多、很少出做空
|
2026-02-20 20:57:25 +08:00
|
|
|
|
model = self._create_model()
|
2026-02-21 15:20:33 +08:00
|
|
|
|
sw = compute_sample_weight('balanced', y_train)
|
|
|
|
|
|
model.fit(X_train, y_train, sample_weight=sw)
|
2026-02-20 20:57:25 +08:00
|
|
|
|
self.models.append(model)
|
2026-02-21 15:38:23 +08:00
|
|
|
|
self.scalers.append(scaler)
|
2026-02-20 20:57:25 +08:00
|
|
|
|
|
|
|
|
|
|
# 预测概率 + 置信度过滤
|
|
|
|
|
|
proba = model.predict_proba(X_test)
|
|
|
|
|
|
max_proba = proba.max(axis=1)
|
|
|
|
|
|
raw_preds = model.predict(X_test)
|
|
|
|
|
|
|
|
|
|
|
|
# 置信度不够的设为观望
|
|
|
|
|
|
filtered_preds = raw_preds.copy()
|
|
|
|
|
|
filtered_preds[max_proba < confidence_threshold] = 0
|
|
|
|
|
|
|
|
|
|
|
|
preds = pd.Series(filtered_preds, index=X_test.index)
|
|
|
|
|
|
all_preds = pd.concat([all_preds, preds])
|
|
|
|
|
|
|
|
|
|
|
|
# 准确率(用原始预测算)
|
|
|
|
|
|
acc = (raw_preds == y_test).mean()
|
|
|
|
|
|
n_filtered = (max_proba < confidence_threshold).sum()
|
|
|
|
|
|
window_count += 1
|
|
|
|
|
|
logger.info(f" 窗口 {window_count}: 训练[{start}:{train_end}] "
|
|
|
|
|
|
f"测试[{train_end}:{test_end}] 准确率={acc:.2%} "
|
|
|
|
|
|
f"过滤={n_filtered}/{len(X_test)}")
|
|
|
|
|
|
|
|
|
|
|
|
start += step
|
|
|
|
|
|
|
|
|
|
|
|
# 特征重要性(取最后一个模型)
|
|
|
|
|
|
if self.models:
|
|
|
|
|
|
last_model = self.models[-1]
|
|
|
|
|
|
if hasattr(last_model, 'feature_importances_'):
|
|
|
|
|
|
self.feature_importance = pd.Series(
|
|
|
|
|
|
last_model.feature_importances_, index=X.columns
|
|
|
|
|
|
).sort_values(ascending=False)
|
2026-02-21 15:38:23 +08:00
|
|
|
|
self.last_scaler = self.scalers[-1] if self.scalers else None
|
2026-02-20 20:57:25 +08:00
|
|
|
|
|
|
|
|
|
|
logger.info(f"Walk-Forward 完成: {window_count} 个窗口, "
|
|
|
|
|
|
f"共 {len(all_preds)} 条预测")
|
|
|
|
|
|
return all_preds
|
|
|
|
|
|
|
|
|
|
|
|
def get_top_features(self, n: int = 20) -> pd.Series:
|
|
|
|
|
|
"""获取Top N重要特征"""
|
|
|
|
|
|
if self.feature_importance is not None:
|
|
|
|
|
|
return self.feature_importance.head(n)
|
|
|
|
|
|
return pd.Series(dtype=float)
|
|
|
|
|
|
|
|
|
|
|
|
def run(self, period: int = None, start_date: str = None, end_date: str = None) -> dict:
|
|
|
|
|
|
"""
|
|
|
|
|
|
完整运行方案B
|
|
|
|
|
|
若指定了 start_date/end_date,会向前加载 warm_up_months 月数据用于训练,使回测区间首月即有预测。
|
|
|
|
|
|
:return: 回测结果
|
|
|
|
|
|
"""
|
|
|
|
|
|
if period is None:
|
|
|
|
|
|
period = PRIMARY_PERIOD
|
|
|
|
|
|
|
|
|
|
|
|
logger.info("=" * 60)
|
|
|
|
|
|
logger.info(f"方案B:AI模型策略 ({self.model_type})")
|
|
|
|
|
|
logger.info("=" * 60)
|
|
|
|
|
|
|
|
|
|
|
|
from .data_loader import load_kline
|
|
|
|
|
|
|
|
|
|
|
|
# 1. 准备数据:若指定了回测区间,则向前加载预热数据,使区间内从首月就有预测
|
|
|
|
|
|
load_start, load_end = start_date, end_date
|
|
|
|
|
|
if start_date and end_date:
|
|
|
|
|
|
warm_months = MC.get('warm_up_months', 12)
|
|
|
|
|
|
load_start_ts = pd.Timestamp(start_date) - pd.DateOffset(months=warm_months)
|
|
|
|
|
|
load_start = load_start_ts.strftime('%Y-%m-%d')
|
|
|
|
|
|
logger.info(f"回测区间 [{start_date} ~ {end_date}],向前加载 {warm_months} 月至 {load_start} 用于训练")
|
|
|
|
|
|
|
2026-02-21 15:38:23 +08:00
|
|
|
|
# AI 策略使用窗口内标准化,prepare_dataset 此处返回未标准化特征,避免全样本泄露
|
|
|
|
|
|
X, y, feature_names, _ = prepare_dataset(period, load_start, load_end, normalize=False)
|
2026-02-20 20:57:25 +08:00
|
|
|
|
|
|
|
|
|
|
# 2. Walk-Forward 训练
|
2026-02-21 15:38:23 +08:00
|
|
|
|
predictions = self.walk_forward_train(
|
|
|
|
|
|
X,
|
|
|
|
|
|
y,
|
|
|
|
|
|
confidence_threshold=MC.get('confidence_threshold', 0.45),
|
|
|
|
|
|
normalize=bool(FC.get('normalize', True)),
|
|
|
|
|
|
)
|
2026-02-20 20:57:25 +08:00
|
|
|
|
|
|
|
|
|
|
# 3. 回测仅用用户指定区间;将预测对齐到该区间的每根K线
|
|
|
|
|
|
df = load_kline(period, start_date, end_date)
|
|
|
|
|
|
if df.empty:
|
|
|
|
|
|
logger.warning("回测区间内无K线数据")
|
|
|
|
|
|
return BacktestEngine()._empty_result()
|
|
|
|
|
|
|
|
|
|
|
|
# 对齐信号:回测区间内有的时间戳用预测,缺失的填 0(观望)
|
|
|
|
|
|
signals = predictions.reindex(df.index, fill_value=0).astype(int)
|
|
|
|
|
|
prices = df['close']
|
|
|
|
|
|
|
|
|
|
|
|
# 4. 回测
|
|
|
|
|
|
engine = BacktestEngine()
|
|
|
|
|
|
result = engine.run(prices, signals)
|
|
|
|
|
|
|
|
|
|
|
|
print_metrics(result['metrics'], f"方案B: {self.model_type} AI策略")
|
|
|
|
|
|
|
2026-02-21 15:20:33 +08:00
|
|
|
|
# 4.1 打印每月收益(USDT)
|
|
|
|
|
|
if result.get('monthly_pnl') is not None and not result['monthly_pnl'].empty:
|
|
|
|
|
|
mp = result['monthly_pnl'].astype(float).round(2)
|
|
|
|
|
|
logger.info("\n" + "-" * 50)
|
|
|
|
|
|
logger.info(" 方案B 每月收益 (USDT)")
|
|
|
|
|
|
logger.info("-" * 50)
|
|
|
|
|
|
logger.info(f"\n{mp.to_string()}")
|
|
|
|
|
|
logger.info("-" * 50)
|
|
|
|
|
|
|
2026-02-20 20:57:25 +08:00
|
|
|
|
# 5. 保存最后一窗模型、scaler、特征列(供实盘 get_live_signal 使用)
|
2026-02-21 15:38:23 +08:00
|
|
|
|
if self.models:
|
2026-02-20 20:57:25 +08:00
|
|
|
|
SCHEME_B_MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
joblib.dump(self.models[-1], SCHEME_B_MODEL_FILE)
|
2026-02-21 15:38:23 +08:00
|
|
|
|
# 若未启用标准化,保存 None,实盘推理时将自动跳过 transform
|
|
|
|
|
|
joblib.dump(self.last_scaler, SCHEME_B_SCALER_FILE)
|
2026-02-20 20:57:25 +08:00
|
|
|
|
with open(SCHEME_B_FEATURES_FILE, 'w', encoding='utf-8') as f:
|
|
|
|
|
|
json.dump(feature_names, f, ensure_ascii=False)
|
2026-02-21 15:38:23 +08:00
|
|
|
|
logger.info(
|
|
|
|
|
|
f"已保存方案B模型: {SCHEME_B_MODEL_FILE}, "
|
|
|
|
|
|
f"scaler={'启用' if self.last_scaler is not None else '无'}, "
|
|
|
|
|
|
f"{len(feature_names)} 个特征"
|
|
|
|
|
|
)
|
2026-02-20 20:57:25 +08:00
|
|
|
|
|
|
|
|
|
|
# 6. 输出特征重要性
|
|
|
|
|
|
top_feat = self.get_top_features(15)
|
|
|
|
|
|
if not top_feat.empty:
|
|
|
|
|
|
logger.info("\nTop 15 重要特征:")
|
|
|
|
|
|
for i, (feat, imp) in enumerate(top_feat.items()):
|
|
|
|
|
|
logger.info(f" {i+1}. {feat}: {imp:.4f}")
|
|
|
|
|
|
|
|
|
|
|
|
result['feature_importance'] = self.feature_importance
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_ai_strategy(model_type: str = 'lightgbm', period: int = None,
|
|
|
|
|
|
start_date: str = None, end_date: str = None) -> dict:
|
|
|
|
|
|
"""方案B快捷入口"""
|
|
|
|
|
|
strategy = AIStrategy(model_type=model_type)
|
|
|
|
|
|
return strategy.run(period, start_date, end_date)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_live_signal(period: int = None, model_type: str = 'lightgbm',
|
|
|
|
|
|
start_date: str = None, end_date: str = None) -> int:
|
|
|
|
|
|
"""
|
|
|
|
|
|
使用已保存的方案B模型对当前最新K线生成信号(供实盘/模拟盘调用)。
|
|
|
|
|
|
需先运行过 run_ai_strategy 或 AIStrategy().run() 以生成 models/scheme_b_*.joblib 与 features.json。
|
|
|
|
|
|
:param period: K线主周期,默认 15
|
|
|
|
|
|
:param model_type: 未使用(模型已固定为磁盘上的 scheme_b_last_model.joblib)
|
|
|
|
|
|
:param start_date, end_date: 可选,限制 load_kline 范围
|
|
|
|
|
|
:return: 0=观望, 1=做多, 2=做空
|
|
|
|
|
|
"""
|
|
|
|
|
|
if period is None:
|
|
|
|
|
|
period = PRIMARY_PERIOD
|
2026-02-21 15:38:23 +08:00
|
|
|
|
if not SCHEME_B_MODEL_FILE.exists() or not SCHEME_B_FEATURES_FILE.exists():
|
2026-02-20 20:57:25 +08:00
|
|
|
|
logger.warning("方案B模型未找到,请先运行 AI 策略训练保存模型")
|
|
|
|
|
|
return 0
|
|
|
|
|
|
model = joblib.load(SCHEME_B_MODEL_FILE)
|
2026-02-21 15:38:23 +08:00
|
|
|
|
scaler = None
|
|
|
|
|
|
if SCHEME_B_SCALER_FILE.exists():
|
|
|
|
|
|
scaler = joblib.load(SCHEME_B_SCALER_FILE)
|
2026-02-20 20:57:25 +08:00
|
|
|
|
with open(SCHEME_B_FEATURES_FILE, 'r', encoding='utf-8') as f:
|
|
|
|
|
|
feature_cols = json.load(f)
|
|
|
|
|
|
X_last = get_latest_feature_row(period, feature_cols, start_date, end_date)
|
|
|
|
|
|
if X_last.empty:
|
|
|
|
|
|
return 0
|
2026-02-21 15:38:23 +08:00
|
|
|
|
if scaler is not None:
|
|
|
|
|
|
X_scaled = scaler.transform(X_last)
|
|
|
|
|
|
X_scaled_df = pd.DataFrame(X_scaled, columns=feature_cols, index=X_last.index)
|
|
|
|
|
|
else:
|
|
|
|
|
|
X_scaled_df = X_last.copy()
|
2026-02-21 14:28:17 +08:00
|
|
|
|
proba = model.predict_proba(X_scaled_df)[0] # (p0, p1, p2)
|
|
|
|
|
|
pred = model.predict(X_scaled_df)[0]
|
|
|
|
|
|
confidence_threshold = MC.get('confidence_threshold', 0.45)
|
2026-02-21 15:20:33 +08:00
|
|
|
|
# 做空可单独使用更低阈值,避免模型偏向做多导致从不开空
|
|
|
|
|
|
threshold_short = MC.get('confidence_threshold_short')
|
|
|
|
|
|
pred_int = int(pred)
|
|
|
|
|
|
use_threshold = (threshold_short if threshold_short is not None and pred_int == 2 else confidence_threshold)
|
|
|
|
|
|
proba_pred = proba[pred_int]
|
|
|
|
|
|
logger.info(f"方案B 预测概率: 观望={proba[0]:.2f} 做多={proba[1]:.2f} 做空={proba[2]:.2f} -> {pred_int}")
|
|
|
|
|
|
if proba_pred < use_threshold:
|
|
|
|
|
|
if pred_int == 2:
|
|
|
|
|
|
logger.info(f"做空概率 {proba[2]:.2f} < {use_threshold},视为观望(可在 config 中设置 confidence_threshold_short=0.40 或重训)")
|
|
|
|
|
|
else:
|
|
|
|
|
|
logger.info(f"预测类别置信度 {proba_pred:.2f} < {use_threshold},视为观望")
|
2026-02-21 14:28:17 +08:00
|
|
|
|
return 0
|
2026-02-21 15:20:33 +08:00
|
|
|
|
return pred_int
|