# lm_code/交易/bitmart-AI策略回测.py

"""
AI/ML 交易策略回测 — LightGBM + 30+技术指标

核心思路:
    1. 用30+种技术指标作为特征（EMA/RSI/BB/MACD/ATR/K线形态/动量/波动率等）
    2. 标签：未来N根K线的收益方向（涨>阈值=做多，跌>阈值=做空，否则=不交易）
    3. 滚动训练：每月用过去3个月数据训练，预测下一个月
    4. 只在模型高置信度时开仓（概率>阈值）
    5. 同一时间只持1个仓

条件: 100U保证金, 100x杠杆, 90%返佣, >3分钟持仓
"""
import datetime
import sqlite3
import time as _time
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit

import warnings
warnings.filterwarnings('ignore')

# ==================== 数据加载 ====================
def load_data():
    """Load the 1-minute ETH candles for 2025 from the local SQLite database.

    Reads ``id`` (aliased to ``ts``), ``open``, ``high``, ``low`` and ``close``
    from table ``bitmart_eth_1m`` in ``<this file's directory>/../models/database.db``,
    keeping rows with ``start <= id < end`` ordered by ``id``.

    Returns:
        pandas.DataFrame indexed by ``datetime`` (``ts`` decoded as epoch
        milliseconds) with columns ``ts``, ``open``, ``high``, ``low``, ``close``.
    """
    db_path = Path(__file__).parent.parent / 'models' / 'database.db'

    # NOTE(review): these naive datetimes are converted with the machine's local
    # time zone, while the index below is decoded from epoch ms without any time
    # zone. The window edges are therefore shifted by the local UTC offset --
    # confirm this is intended.
    start_ms = int(datetime.datetime(2025, 1, 1).timestamp()) * 1000
    end_ms = int(datetime.datetime(2026, 1, 1).timestamp()) * 1000

    # Bound parameters instead of string interpolation.
    query = (
        "SELECT id as ts, open, high, low, close FROM bitmart_eth_1m "
        "WHERE id >= ? AND id < ? ORDER BY id"
    )

    conn = sqlite3.connect(str(db_path))
    try:
        df = pd.read_sql_query(query, conn, params=(start_ms, end_ms))
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

    df['datetime'] = pd.to_datetime(df['ts'], unit='ms')
    df.set_index('datetime', inplace=True)
    return df

# ==================== 特征工程 ====================
def add_features(df):
    """Add the technical-indicator columns to ``df`` in place and return it.

    ``df`` must provide ``open``, ``high``, ``low`` and ``close`` columns and a
    datetime-like index (``.hour`` / ``.minute`` are read from it). Columns are
    created in a fixed order; callers that derive the feature list from
    ``df.columns`` depend on that order.
    """
    close, high, low, open_ = df['close'], df['high'], df['low'], df['open']

    # --- Exponential moving averages of the close ---
    for span in (5, 8, 13, 21, 50, 120):
        df[f'ema_{span}'] = close.ewm(span=span, adjust=False).mean()

    # EMA spreads and slopes, all expressed relative to price
    ema8, ema21 = df['ema_8'], df['ema_21']
    ema50, ema120 = df['ema_50'], df['ema_120']
    df['ema_fast_slow'] = (ema8 - ema21) / close
    df['ema_slow_big'] = (ema21 - ema120) / close
    df['price_vs_ema120'] = (close - ema120) / close
    df['price_vs_ema50'] = (close - ema50) / close
    df['ema8_slope'] = ema8.pct_change(5)
    df['ema21_slope'] = ema21.pct_change(5)

    # --- RSI built from simple rolling means of gains and losses ---
    change = close.diff()
    ups = change.clip(lower=0)
    downs = (-change).clip(lower=0)
    for window in (7, 14, 21):
        mean_up = ups.rolling(window).mean()
        mean_down = downs.rolling(window).mean()
        # A zero average loss becomes NaN, so the RSI is NaN rather than inf-based.
        strength = mean_up / mean_down.replace(0, np.nan)
        df[f'rsi_{window}'] = 100 - 100 / (1 + strength)

    # --- Bollinger Bands (20 bars, 2 standard deviations) ---
    bb_mid = close.rolling(20).mean()
    bb_dev = close.rolling(20).std()
    bb_upper = bb_mid + 2 * bb_dev
    bb_lower = bb_mid - 2 * bb_dev
    bb_span = bb_upper - bb_lower
    df['bb_upper'] = bb_upper
    df['bb_lower'] = bb_lower
    df['bb_mid'] = bb_mid
    df['bb_pct'] = (close - bb_lower) / bb_span.replace(0, np.nan)
    df['bb_width'] = bb_span / bb_mid

    # --- MACD (12/26/9), normalised by price ---
    fast_ema = close.ewm(span=12, adjust=False).mean()
    slow_ema = close.ewm(span=26, adjust=False).mean()
    df['macd'] = (fast_ema - slow_ema) / close
    df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']

    # --- ATR from the true range ---
    prev_close = close.shift(1)
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)
    atr14 = true_range.rolling(14).mean()
    df['atr_14'] = atr14
    df['atr_pct'] = atr14 / close
    df['atr_7'] = true_range.rolling(7).mean() / close

    # --- Stochastic oscillator (14 bars, 3-bar %D) ---
    lowest14 = low.rolling(14).min()
    highest14 = high.rolling(14).max()
    df['stoch_k'] = (close - lowest14) / (highest14 - lowest14).replace(0, np.nan) * 100
    df['stoch_d'] = df['stoch_k'].rolling(3).mean()

    # --- Momentum: trailing N-bar returns ---
    for lookback in (1, 3, 5, 10, 20, 60):
        df[f'ret_{lookback}'] = close.pct_change(lookback)

    # --- Volatility: rolling std of 1-bar returns ---
    bar_ret = close.pct_change()
    df['vol_5'] = bar_ret.rolling(5).std()
    df['vol_20'] = bar_ret.rolling(20).std()
    df['vol_ratio'] = df['vol_5'] / df['vol_20'].replace(0, np.nan)

    # --- Candle shape ---
    body = (close - open_).abs()
    open_close = pd.concat([open_, close], axis=1)
    up_bar = close > open_
    down_bar = close < open_
    df['body_pct'] = body / close
    df['upper_shadow'] = (high - open_close.max(axis=1)) / close
    df['lower_shadow'] = (open_close.min(axis=1) - low) / close
    df['body_vs_range'] = body / (high - low).replace(0, np.nan)
    df['is_bullish'] = up_bar.astype(float)

    def _run_length(flag):
        """Length of the current run of 1s in ``flag``; 0 where ``flag`` is 0."""
        run_id = (flag != flag.shift()).cumsum()
        return (flag.groupby(run_id).cumcount() + 1) * flag

    # Consecutive same-direction candles
    df['consec_bull'] = _run_length(up_bar.astype(int))
    df['consec_bear'] = _run_length(down_bar.astype(int))

    # Engulfing patterns
    prev_open = open_.shift(1)
    df['engulf_ratio'] = body / body.shift(1).replace(0, np.nan)
    df['bullish_engulf'] = (
        (prev_close < prev_open) & up_bar
        & (close > prev_open) & (open_ <= prev_close)
    ).astype(float)
    df['bearish_engulf'] = (
        (prev_close > prev_open) & down_bar
        & (close < prev_open) & (open_ >= prev_close)
    ).astype(float)

    # Position of the close inside the 20-bar high/low range
    df['high_20'] = high.rolling(20).max()
    df['low_20'] = low.rolling(20).min()
    df['price_position'] = (close - df['low_20']) / (df['high_20'] - df['low_20']).replace(0, np.nan)

    # Time-of-day features (raw values plus a cyclic encoding of the hour)
    df['hour'] = df.index.hour
    df['minute'] = df.index.minute
    hour_angle = 2 * np.pi * df['hour'] / 24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    return df

# ==================== 标签生成 ====================
def add_labels(df, forward_bars=10, threshold=0.002):
    """Add a ``label`` column holding the direction of the forward return.

    The forward return of a row is ``close[t + forward_bars] / close[t] - 1``
    (rows are shifted by position, not by timestamp):

      * ``> threshold``   ->  1  (long opportunity)
      * ``< -threshold``  -> -1  (short opportunity)
      * otherwise         ->  0  (no trade)
      * unknown           -> NaN (the last ``forward_bars`` rows, or rows whose
        close is missing)

    Rows with an unknown future are deliberately NOT labelled 0: a fabricated
    "neutral" target would be indistinguishable from a real one, whereas NaN
    lets a consumer drop those rows with ``dropna(subset=['label'])``.
    As a consequence the column is float-valued.

    Args:
        df: DataFrame with a ``close`` column; modified in place.
        forward_bars: Look-ahead horizon in rows; must be positive.
        threshold: Minimum absolute forward return that counts as a move.

    Returns:
        The same DataFrame, with the ``label`` column added.

    Raises:
        ValueError: If ``forward_bars`` is not positive (the "future" return
            would otherwise be computed from the current or past bars).
    """
    if forward_bars <= 0:
        raise ValueError("forward_bars must be a positive number of bars")

    future_ret = df['close'].shift(-forward_bars) / df['close'] - 1

    # The first matching condition wins, so unknown returns are handled before
    # the threshold comparisons.
    df['label'] = np.select(
        [future_ret.isna(), future_ret > threshold, future_ret < -threshold],
        [np.nan, 1.0, -1.0],
        default=0.0,
    )
    return df

# ==================== 模型训练 + 预测 ====================
def get_feature_cols(df):
    """Return the names of the columns to feed to the model, in column order.

    A column qualifies when it is not in the exclusion set below (raw prices,
    the label, and absolute price-level indicators) and its dtype is one of
    float64 / float32 / int64 / int32.
    """
    non_features = {
        'ts', 'open', 'high', 'low', 'close', 'label',
        'bb_upper', 'bb_lower', 'bb_mid', 'high_20', 'low_20',
        'atr_14', 'ema_5', 'ema_8', 'ema_13', 'ema_21', 'ema_50', 'ema_120',
    }
    numeric_dtypes = ('float64', 'float32', 'int64', 'int32')

    selected = []
    for name in df.columns:
        if name in non_features:
            continue
        if df[name].dtype in numeric_dtypes:
            selected.append(name)
    return selected

def train_predict_walkforward(df, feature_cols, train_months=3):
    """Walk-forward LightGBM training and prediction, one calendar month at a time.

    For every month ``i >= train_months`` (months taken from the index), a
    3-class model is trained on the rows of the preceding ``train_months``
    months and used to predict class probabilities for month ``i``.
    Labels -1/0/1 are mapped to classes 0/1/2, so the predicted columns are
    [P(short), P(neutral), P(long)].

    Training rows with NaN in any feature or in ``label`` are dropped; test rows
    with NaN in any feature are dropped. A month is skipped when fewer than 1000
    training rows or fewer than 100 test rows remain.

    Side effect: adds a ``month`` column (monthly period of the index) to ``df``.

    Args:
        df: DataFrame with a datetime index, the feature columns and ``label``.
        feature_cols: Names of the feature columns.
        train_months: Number of months in each training window.

    Returns:
        ``(proba_long, proba_short)``: two float Series aligned with
        ``df.index``. Rows that were never predicted keep the value 0.0.
    """
    df['month'] = df.index.to_period('M')
    months = sorted(df['month'].unique())

    # 0.0 means "no signal" for every row that never receives a prediction.
    all_proba_long = pd.Series(0.0, index=df.index, dtype=float)
    all_proba_short = pd.Series(0.0, index=df.index, dtype=float)

    # Loop-invariant pieces, built once.
    train_required = list(feature_cols) + ['label']
    params = {
        'objective': 'multiclass',
        'num_class': 3,
        'metric': 'multi_logloss',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'max_depth': 6,
        'min_child_samples': 50,
        # NOTE(review): LightGBM only applies row subsampling when
        # bagging_freq / subsample_freq is non-zero; as written this setting
        # appears to have no effect -- confirm before relying on it.
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'verbose': -1,
        'n_jobs': -1,
        'seed': 42,
    }

    print(f"\n  Walk-forward training ({len(months)} months, train={train_months}m):", flush=True)

    for i in range(train_months, len(months)):
        test_month = months[i]
        train_start = months[i - train_months]

        train_mask = (df['month'] >= train_start) & (df['month'] < test_month)
        test_mask = df['month'] == test_month

        # NOTE(review): if 'label' is built from future closes (as add_labels in
        # this file does), the last rows of the training window are labelled
        # with prices from the test month; consider purging them.
        train_df = df[train_mask].dropna(subset=train_required)
        test_df = df[test_mask].dropna(subset=feature_cols)

        if len(train_df) < 1000 or len(test_df) < 100:
            print(f"    {test_month}: skip (data insufficient)", flush=True)
            continue

        X_train = train_df[feature_cols].values
        X_test = test_df[feature_cols].values

        # Map -1, 0, 1 to the class ids 0, 1, 2 required by the multiclass objective.
        y_train_cls = train_df['label'].values + 1

        dtrain = lgb.Dataset(X_train, label=y_train_cls)
        model = lgb.train(params, dtrain, num_boost_round=200)

        # shape (n, 3): [P(short), P(neutral), P(long)]
        proba = model.predict(X_test)

        test_idx = test_df.index
        all_proba_short.loc[test_idx] = proba[:, 0]
        all_proba_long.loc[test_idx] = proba[:, 2]

        # Feature importance is reported for the final month's model only.
        if i == len(months) - 1:
            importance = model.feature_importance(importance_type='gain')
            feat_imp = sorted(zip(feature_cols, importance), key=lambda x: -x[1])
            print(f"\n  Top 10 features:", flush=True)
            for fname, imp in feat_imp[:10]:
                print(f"    {fname:<20} {imp:.0f}", flush=True)

        # Informational count at a fixed 0.45 cut-off (independent of the
        # threshold used later by the backtest).
        long_cnt = (proba[:, 2] > 0.45).sum()
        short_cnt = (proba[:, 0] > 0.45).sum()
        print(f"    {test_month}: train={len(train_df):,} test={len(test_df):,} "
              f"signals: long={long_cnt} short={short_cnt}", flush=True)

    return all_proba_long, all_proba_short

# ==================== 回测引擎 ====================
def backtest(df, proba_long, proba_short, notional=10000.0,
             prob_threshold=0.45, min_hold=200, max_hold=1800,
             sl_pct=0.004, tp_pct=0.006):
    """Simulate a single-position strategy driven by the model probabilities.

    Bars are processed in order; entries and exits both use the bar's close.

    Entry (only when flat): go long when ``P(long) > prob_threshold`` and it
    exceeds ``P(short)``; go short in the mirrored case.

    Exit checks, in order:
      1. Hard stop: loss >= ``1.5 * sl_pct``, at any holding time.
      2. Once held for at least ``min_hold`` seconds:
         stop-loss (loss >= ``sl_pct``), take-profit (gain >= ``tp_pct``),
         time-out (held >= ``max_hold`` seconds), then model reversal
         (opposite probability > ``prob_threshold + 0.05``).
    After a model-reversal exit a new position may be opened on the same bar;
    after any other exit the next entry can happen on the following bar at the
    earliest. A position still open after the last bar is closed at the last close.

    Fees are not applied here; the returned P&L is gross.

    Args:
        df: DataFrame with a datetime index and a ``close`` column.
        proba_long, proba_short: Series positionally aligned with ``df``.
        notional: Position size used to convert returns into P&L.
        prob_threshold: Minimum probability required to open a position.
        min_hold, max_hold: Holding-time bounds in seconds.
        sl_pct, tp_pct: Stop-loss / take-profit distances as fractions of the
            entry price.

    Returns:
        List of tuples ``(direction, entry_price, exit_price, gross_pnl,
        held_seconds, reason, entry_time, exit_time)`` with ``direction`` in
        ``{1, -1}``.
    """
    # Pull the columns out once; per-bar Series/.iloc lookups dominate the
    # runtime on minute data.
    times = df.index
    closes = df['close'].to_numpy()
    long_p = proba_long.to_numpy()
    short_p = proba_short.to_numpy()
    reversal_threshold = prob_threshold + 0.05

    pos = 0
    entry_price = 0.0
    entry_time = None
    trades = []

    for i, now in enumerate(times):
        price = closes[i]
        pl = long_p[i]
        ps = short_p[i]

        # ---- manage the open position ----
        if pos != 0 and entry_time is not None:
            if pos == 1:
                ret = (price - entry_price) / entry_price
            else:
                ret = (entry_price - price) / entry_price
            held = (now - entry_time).total_seconds()

            reason = None
            if -ret >= sl_pct * 1.5:
                reason = "硬止损"
            elif held >= min_hold:
                if -ret >= sl_pct:
                    reason = "止损"
                elif ret >= tp_pct:
                    reason = "止盈"
                elif held >= max_hold:
                    reason = "超时"
                elif pos == 1 and ps > reversal_threshold:
                    reason = "AI反转"
                elif pos == -1 and pl > reversal_threshold:
                    reason = "AI反转"

            if reason is not None:
                trades.append((pos, entry_price, price, notional * ret, held,
                               reason, entry_time, now))
                pos = 0
                # Only a model-reversal exit may re-enter on the same bar.
                if reason != "AI反转":
                    continue

        # ---- open a new position ----
        if pos == 0:
            if pl > prob_threshold and pl > ps:
                pos = 1
                entry_price = price
                entry_time = now
            elif ps > prob_threshold and ps > pl:
                pos = -1
                entry_price = price
                entry_time = now

    # Close whatever is still open at the last available close.
    if pos != 0:
        price = closes[-1]
        now = times[-1]
        if pos == 1:
            ret = (price - entry_price) / entry_price
        else:
            ret = (entry_price - price) / entry_price
        trades.append((pos, entry_price, price, notional * ret,
                       (now - entry_time).total_seconds(), "结束", entry_time, now))

    return trades

# ==================== 结果分析 ====================
def analyze(trades, notional, label):
    """Print a performance report for ``trades`` and return the net profit.

    Each trade is a tuple whose element 3 is the gross P&L, element 5 the exit
    reason and element 7 the exit time. Every trade is charged a net fee equal
    to the round-trip fee (``notional * 0.0006 * 2``) minus a 90% rebate.

    Returns 0 when there are no trades, otherwise gross P&L minus net fees.
    """
    if not trades:
        print(f"  [{label}] No trades", flush=True)
        return 0

    trade_count = len(trades)
    round_trip_fee = notional * 0.0006 * 2
    rebate = round_trip_fee * 0.9
    net_fee = round_trip_fee - rebate

    gross = sum(t[3] for t in trades)
    net = gross - net_fee * trade_count
    winners = [t[3] for t in trades if t[3] > 0]
    win_count = len(winners)
    win_rate = win_count / trade_count * 100
    rebate_total = rebate * trade_count

    # One pass: per-month buckets, running drawdown and exit-reason counts.
    by_month = {}
    by_reason = {}
    equity = 0
    high_water = 0
    max_dd = 0
    for t in trades:
        after_fee = t[3] - net_fee

        bucket = by_month.setdefault(t[7].strftime('%Y-%m'), {'n': 0, 'net': 0, 'w': 0})
        bucket['n'] += 1
        bucket['net'] += after_fee
        if t[3] > 0:
            bucket['w'] += 1

        equity += after_fee
        high_water = max(high_water, equity)
        max_dd = max(max_dd, high_water - equity)

        by_reason[t[5]] = by_reason.get(t[5], 0) + 1

    profitable_months = sum(1 for m in by_month.values() if m['net'] > 0)

    rule = '=' * 75
    print(f"\n{rule}", flush=True)
    print(f"  {label}", flush=True)
    print(f"{rule}", flush=True)
    print(f"  方向盈亏: {gross:>+10.0f} USDT", flush=True)
    print(f"  返佣:     {rebate_total:>+10.0f} USDT", flush=True)
    print(f"  净手续费: {net_fee*trade_count:>10.0f} USDT", flush=True)
    print(f"  ================================", flush=True)
    print(f"  年净利:   {net:>+10.0f} USDT (月均 {net/12:>+.0f})", flush=True)
    print(f"  交易:     {trade_count}笔 | 胜率: {win_rate:.1f}% | 盈利月: {profitable_months}/12", flush=True)
    print(f"  最大回撤: {max_dd:>.0f} USDT", flush=True)

    if win_count > 0:
        avg_win = sum(winners) / win_count
        if trade_count > win_count:
            avg_loss = sum(t[3] for t in trades if t[3] <= 0) / (trade_count - win_count)
        else:
            avg_loss = 0
        payoff = abs(avg_win / avg_loss) if avg_loss != 0 else 999
        print(f"  均赢: {avg_win:>+.2f} | 均亏: {avg_loss:>+.2f} | 盈亏比: {payoff:.2f}", flush=True)

    print(f"\n  平仓原因:", flush=True)
    for reason, cnt in sorted(by_reason.items(), key=lambda kv: -kv[1]):
        print(f"    {reason:<10} {cnt:>5}笔 ({cnt/trade_count*100:.1f}%)", flush=True)

    print(f"\n  月度:", flush=True)
    for month in sorted(by_month.keys()):
        stats = by_month[month]
        month_wr = stats['w'] / stats['n'] * 100 if stats['n'] > 0 else 0
        plus = "+" * min(30, max(0, int(stats['net'] / 100)))
        minus = "-" * min(30, max(0, int(-stats['net'] / 100)))
        bar = plus + minus
        print(f"  {month} {stats['n']:>4}笔 {stats['net']:>+8.0f} {month_wr:>4.0f}% {bar}", flush=True)
    print(f"  {'合计':>7} {trade_count:>4}笔 {net:>+8.0f}", flush=True)

    print(f"\n  仓位放大:", flush=True)
    for margin in (100, 300, 500, 800, 1000):
        scale = margin * 100 / notional
        scaled_monthly = net * scale / 12
        print(f"    {margin}U: 月均 {scaled_monthly:>+.0f} USDT {'<<< 达标' if scaled_monthly>=1000 else ''}", flush=True)

    print(f"{rule}", flush=True)
    return net

# ==================== 主函数 ====================
def main():
    """Run the full pipeline for every configuration and report the best one.

    Steps: load the candles, build the features once, then for each
    configuration build labels on a copy of the data, run the walk-forward
    training, backtest the resulting probabilities and print the analysis.
    The configuration with the highest net profit is summarised at the end.
    """
    started = _time.time()
    rule = "=" * 75
    notional = 10000.0

    print(rule, flush=True)
    print("  AI/ML 交易策略 — LightGBM + 30+技术指标", flush=True)
    print(rule, flush=True)

    print("\n[1/4] 加载数据...", flush=True)
    df = load_data()
    print(f"  {len(df):,} 根 1分钟K线", flush=True)

    print("\n[2/4] 特征工程 (30+指标)...", flush=True)
    df = add_features(df)
    feature_cols = get_feature_cols(df)
    print(f"  生成 {len(feature_cols)} 个特征", flush=True)

    # Look-ahead horizons and thresholds to compare:
    # (forward_bars, threshold, prob_threshold, sl, tp, label)
    configs = [
        (5,  0.001, 0.42, 0.003, 0.004, "AI-v1: 5bar前瞻 阈值0.1%"),
        (10, 0.002, 0.42, 0.004, 0.006, "AI-v2: 10bar前瞻 阈值0.2%"),
        (10, 0.002, 0.45, 0.004, 0.006, "AI-v3: 10bar 高置信0.45"),
        (10, 0.003, 0.45, 0.005, 0.008, "AI-v4: 10bar 阈值0.3% 宽SL"),
        (20, 0.003, 0.42, 0.005, 0.008, "AI-v5: 20bar前瞻 阈值0.3%"),
        (20, 0.004, 0.45, 0.005, 0.010, "AI-v6: 20bar 阈值0.4% 大TP"),
    ]

    best_net = -999999
    best_label = ""

    for horizon, move_threshold, prob_cut, stop_loss, take_profit, name in configs:
        print(f"\n{rule}", flush=True)
        print(f"  [{name}]", flush=True)
        print(f"  前瞻={horizon}bar 方向阈值={move_threshold*100:.1f}% 概率阈值={prob_cut} "
              f"SL={stop_loss*100:.1f}% TP={take_profit*100:.1f}%", flush=True)
        print(f"{rule}", flush=True)

        print("\n[3/4] 生成标签...", flush=True)
        # Work on a copy so each configuration starts from the same feature frame.
        labeled = add_labels(df.copy(), forward_bars=horizon, threshold=move_threshold)
        label_col = labeled['label']
        n_long = int((label_col == 1).sum())
        n_short = int((label_col == -1).sum())
        n_flat = int((label_col == 0).sum())
        print(f"  多={n_long:,} 空={n_short:,} 中性={n_flat:,}", flush=True)

        print("\n[4/4] 滚动训练+预测...", flush=True)
        proba_long, proba_short = train_predict_walkforward(labeled, feature_cols, train_months=3)

        print("\n  回测...", flush=True)
        trades = backtest(labeled, proba_long, proba_short,
                          notional=notional, prob_threshold=prob_cut,
                          sl_pct=stop_loss, tp_pct=take_profit)

        net = analyze(trades, notional, name)
        if net > best_net:
            best_net = net
            best_label = name

    elapsed = _time.time() - started
    print(f"\n\n{rule}", flush=True)
    print(f"  总结 | 耗时 {elapsed:.0f}s", flush=True)
    print(f"{rule}", flush=True)
    print(f"  最佳: {best_label}", flush=True)
    print(f"  年净利: {best_net:+.0f} USDT = 月均 {best_net/12:+.0f} USDT", flush=True)
    if best_net > 0:
        # Margin (in U) at which the monthly average would reach 1000 USDT,
        # scaling linearly from the 100U baseline.
        needed = int(12000 / best_net * 100) + 1
        print(f"  达到1000U/月需保证金: ~{needed}U", flush=True)
    print(f"{rule}", flush=True)


if __name__ == '__main__':
    main()