# File listing metadata (from the code viewer): 443 lines, 17 KiB, Python
"""
|
||
AI/ML 交易策略回测 — LightGBM + 30+技术指标
|
||
|
||
核心思路:
|
||
1. 用30+种技术指标作为特征(EMA/RSI/BB/MACD/ATR/K线形态/动量/波动率等)
|
||
2. 标签:未来N根K线的收益方向(涨>阈值=做多,跌>阈值=做空,否则=不交易)
|
||
3. 滚动训练:每月用过去3个月数据训练,预测下一个月
|
||
4. 只在模型高置信度时开仓(概率>阈值)
|
||
5. 同一时间只持1个仓
|
||
|
||
条件: 100U保证金, 100x杠杆, 90%返佣, >3分钟持仓
|
||
"""
|
||
import datetime
import sqlite3
import time as _time
import warnings
from collections import defaultdict
from contextlib import closing
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

warnings.filterwarnings('ignore')


# ==================== Data loading ====================
def load_data(db_path=None, start=None, end=None):
    """Load 1-minute ETH candles from SQLite into a time-indexed DataFrame.

    Args:
        db_path: Path to the SQLite database file. Defaults to
            ``models/database.db`` two directory levels above this file.
        start: Inclusive lower bound as a ``datetime.datetime``.
            Defaults to 2025-01-01 00:00.
        end: Exclusive upper bound as a ``datetime.datetime``.
            Defaults to 2026-01-01 00:00.

    Returns:
        DataFrame with columns ``ts, open, high, low, close`` ordered by
        ``ts`` and indexed by a ``datetime`` index derived from ``ts``
        (interpreted as epoch milliseconds).
    """
    if db_path is None:
        db_path = Path(__file__).parent.parent / 'models' / 'database.db'
    if start is None:
        start = datetime.datetime(2025, 1, 1)
    if end is None:
        end = datetime.datetime(2026, 1, 1)

    # NOTE(review): naive datetimes are converted with the machine's local
    # timezone by .timestamp(), while the index below is built directly from
    # epoch milliseconds — confirm the intended timezone of the window.
    start_ms = int(start.timestamp()) * 1000
    end_ms = int(end.timestamp()) * 1000

    # closing() guarantees the connection is released even if the query
    # raises; bound parameters replace string-built SQL.
    with closing(sqlite3.connect(str(db_path))) as conn:
        df = pd.read_sql_query(
            "SELECT id as ts, open, high, low, close FROM bitmart_eth_1m "
            "WHERE id >= ? AND id < ? ORDER BY id",
            conn, params=(start_ms, end_ms))

    df['datetime'] = pd.to_datetime(df['ts'], unit='ms')
    df.set_index('datetime', inplace=True)
    return df


# ==================== Feature engineering ====================
def add_features(df):
    """Add 30+ technical-indicator feature columns to ``df`` in place.

    Args:
        df: DataFrame with ``open``, ``high``, ``low`` and ``close`` columns
            and a DatetimeIndex (``hour``/``minute`` are read from the index).

    Returns:
        The same DataFrame object, with the indicator columns appended.
        Ratios whose denominator is zero are left as NaN, except RSI (see
        below).
    """
    close = df['close']
    high = df['high']
    low = df['low']
    open_ = df['open']

    # --- EMA ---
    for span in (5, 8, 13, 21, 50, 120):
        df[f'ema_{span}'] = close.ewm(span=span, adjust=False).mean()

    # EMA relative positions, normalised by price
    df['ema_fast_slow'] = (df['ema_8'] - df['ema_21']) / close  # fast/slow gap
    df['ema_slow_big'] = (df['ema_21'] - df['ema_120']) / close
    df['price_vs_ema120'] = (close - df['ema_120']) / close
    df['price_vs_ema50'] = (close - df['ema_50']) / close
    df['ema8_slope'] = df['ema_8'].pct_change(5)  # EMA slope over 5 bars
    df['ema21_slope'] = df['ema_21'].pct_change(5)

    # --- RSI ---
    # The per-bar gain/loss series do not depend on the period: compute once.
    delta = close.diff()
    gain = delta.clip(lower=0)
    loss = (-delta).clip(lower=0)
    for period in (7, 14, 21):
        avg_gain = gain.rolling(period).mean()
        avg_loss = loss.rolling(period).mean()
        rs = avg_gain / avg_loss.replace(0, np.nan)
        rsi = 100 - 100 / (1 + rs)
        # A window with gains and no losses is the RSI upper bound (100), not
        # "undefined". Leaving it NaN would make downstream dropna() discard
        # exactly the strongest up-moves. A completely flat window stays NaN.
        no_losses = (avg_loss == 0) & (avg_gain > 0)
        df[f'rsi_{period}'] = rsi.mask(no_losses, 100.0)

    # --- Bollinger Bands (20 bars, 2 sigma) ---
    mid = close.rolling(20).mean()
    std = close.rolling(20).std()
    df['bb_upper'] = mid + 2 * std
    df['bb_lower'] = mid - 2 * std
    df['bb_mid'] = mid
    band = df['bb_upper'] - df['bb_lower']
    df['bb_pct'] = (close - df['bb_lower']) / band.replace(0, np.nan)
    df['bb_width'] = band / mid  # volatility proxy

    # --- MACD (normalised by price) ---
    ema12 = close.ewm(span=12, adjust=False).mean()
    ema26 = close.ewm(span=26, adjust=False).mean()
    df['macd'] = (ema12 - ema26) / close
    df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']

    # --- ATR ---
    prev_close = close.shift(1)
    true_range = pd.concat([
        high - low,
        (high - prev_close).abs(),
        (low - prev_close).abs(),
    ], axis=1).max(axis=1)
    df['atr_14'] = true_range.rolling(14).mean()
    df['atr_pct'] = df['atr_14'] / close
    df['atr_7'] = true_range.rolling(7).mean() / close

    # --- Stochastic ---
    low14 = low.rolling(14).min()
    high14 = high.rolling(14).max()
    df['stoch_k'] = (close - low14) / (high14 - low14).replace(0, np.nan) * 100
    df['stoch_d'] = df['stoch_k'].rolling(3).mean()

    # --- Momentum: return over the past N bars ---
    for bars in (1, 3, 5, 10, 20, 60):
        df[f'ret_{bars}'] = close.pct_change(bars)

    # --- Volatility ---
    ret1 = close.pct_change()
    df['vol_5'] = ret1.rolling(5).std()
    df['vol_20'] = ret1.rolling(20).std()
    df['vol_ratio'] = df['vol_5'] / df['vol_20'].replace(0, np.nan)

    # --- Candlestick shape ---
    body = (close - open_).abs()
    candle_top = pd.concat([open_, close], axis=1).max(axis=1)
    candle_bottom = pd.concat([open_, close], axis=1).min(axis=1)
    df['body_pct'] = body / close  # body size relative to price
    df['upper_shadow'] = (high - candle_top) / close
    df['lower_shadow'] = (candle_bottom - low) / close
    df['body_vs_range'] = body / (high - low).replace(0, np.nan)  # body / full range
    df['is_bullish'] = (close > open_).astype(float)

    # Consecutive same-direction candles: run length within each streak,
    # zeroed on bars that are not of that direction.
    bullish = (close > open_).astype(int)
    df['consec_bull'] = bullish.groupby((bullish != bullish.shift()).cumsum()).cumcount() + 1
    df['consec_bull'] = df['consec_bull'] * bullish
    bearish = (close < open_).astype(int)
    df['consec_bear'] = bearish.groupby((bearish != bearish.shift()).cumsum()).cumcount() + 1
    df['consec_bear'] = df['consec_bear'] * bearish

    # Engulfing patterns
    prev_open = open_.shift(1)
    prev_body = body.shift(1)
    df['engulf_ratio'] = body / prev_body.replace(0, np.nan)
    df['bullish_engulf'] = ((prev_close < prev_open) & (close > open_) &
                            (close > prev_open) & (open_ <= prev_close)).astype(float)
    df['bearish_engulf'] = ((prev_close > prev_open) & (close < open_) &
                            (close < prev_open) & (open_ >= prev_close)).astype(float)

    # Position of the close inside the 20-bar high/low range
    df['high_20'] = high.rolling(20).max()
    df['low_20'] = low.rolling(20).min()
    df['price_position'] = (close - df['low_20']) / (df['high_20'] - df['low_20']).replace(0, np.nan)

    # Hour / minute time features (hour also encoded cyclically)
    df['hour'] = df.index.hour
    df['minute'] = df.index.minute
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    return df


# ==================== Label generation ====================
def add_labels(df, forward_bars=10, threshold=0.002):
    """Attach a three-class ``label`` column based on the forward return.

    The forward return is ``close[t + forward_bars] / close[t] - 1``:
        > threshold   ->  1  (long opportunity)
        < -threshold  -> -1  (short opportunity)
        otherwise     ->  0  (no trade)

    Rows whose forward return is undefined (the last ``forward_bars`` rows)
    fall into the "otherwise" bucket and are labelled 0.

    Args:
        df: DataFrame with a ``close`` column; modified in place.
        forward_bars: Look-ahead horizon in bars.
        threshold: Minimum absolute forward return for a directional label.

    Returns:
        The same DataFrame with the integer ``label`` column set.
    """
    closes = df['close']
    forward_return = closes.shift(-forward_bars) / closes - 1

    is_short = forward_return < -threshold
    is_long = forward_return > threshold

    # The short condition is listed first so that it wins whenever both
    # conditions hold (only possible with a negative threshold).
    df['label'] = np.select([is_short, is_long], [-1, 1], default=0).astype('int64')
    return df


# ==================== Model training + prediction ====================
def get_feature_cols(df):
    """Return the names of the columns to feed to the model.

    A column qualifies when it is not one of the raw price / label / absolute
    price-level columns listed below and its dtype is float64, float32, int64
    or int32. Column order follows ``df.columns``.
    """
    non_features = {
        'ts', 'open', 'high', 'low', 'close', 'label',
        'bb_upper', 'bb_lower', 'bb_mid', 'high_20', 'low_20',
        'atr_14', 'ema_5', 'ema_8', 'ema_13', 'ema_21', 'ema_50', 'ema_120',
    }
    accepted_dtypes = ('float64', 'float32', 'int64', 'int32')

    selected = []
    for name in df.columns:
        if name in non_features:
            continue
        if df[name].dtype in accepted_dtypes:
            selected.append(name)
    return selected
def train_predict_walkforward(df, feature_cols, train_months=3):
    """Walk-forward training: fit on the previous months, predict the next one.

    For every calendar month from index ``train_months`` onward, a LightGBM
    3-class model is trained on the preceding ``train_months`` months and used
    to predict that month. Earlier months, skipped months and rows with NaN
    features keep a probability of 0.0 (i.e. "no signal").

    Args:
        df: DataFrame with a DatetimeIndex, the ``feature_cols`` columns and a
            ``label`` column in {-1, 0, 1}. A ``month`` column is added in place.
        feature_cols: List of feature column names.
        train_months: Number of months in each training window.

    Returns:
        ``(proba_long, proba_short)``: two float Series aligned to ``df.index``.
    """
    df['month'] = df.index.to_period('M')
    months = sorted(df['month'].unique())

    # 0.0 everywhere by default, meaning "do not trade".
    all_proba_long = pd.Series(0.0, index=df.index, dtype=float)
    all_proba_short = pd.Series(0.0, index=df.index, dtype=float)

    # Hyper-parameters are identical for every window, so build them once.
    params = {
        'objective': 'multiclass',
        'num_class': 3,
        'metric': 'multi_logloss',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'max_depth': 6,
        'min_child_samples': 50,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'verbose': -1,
        'n_jobs': -1,
        'seed': 42,
    }
    feature_cols = list(feature_cols)
    train_required = feature_cols + ['label']
    last_month_pos = len(months) - 1

    print(f"\n Walk-forward training ({len(months)} months, train={train_months}m):", flush=True)

    for i in range(train_months, len(months)):
        test_month = months[i]
        train_start = months[i - train_months]

        # Training window: [train_start, test_month); test window: test_month.
        # NOTE(review): labels of the last bars in the training window are
        # built from closes that fall inside the test month (see add_labels);
        # consider purging those rows to avoid look-ahead at the boundary.
        train_mask = (df['month'] >= train_start) & (df['month'] < test_month)
        test_mask = df['month'] == test_month

        train_df = df[train_mask].dropna(subset=train_required)
        test_df = df[test_mask].dropna(subset=feature_cols)

        if len(train_df) < 1000 or len(test_df) < 100:
            print(f" {test_month}: skip (data insufficient)", flush=True)
            continue

        X_train = train_df[feature_cols].values
        X_test = test_df[feature_cols].values

        # Map -1, 0, 1 onto class ids 0, 1, 2 for the multiclass objective.
        y_train_cls = train_df['label'].values + 1

        dtrain = lgb.Dataset(X_train, label=y_train_cls)
        model = lgb.train(params, dtrain, num_boost_round=200)

        # Columns of proba: [P(short), P(neutral), P(long)]
        proba = model.predict(X_test)

        test_idx = test_df.index
        all_proba_short.loc[test_idx] = proba[:, 0]
        all_proba_long.loc[test_idx] = proba[:, 2]

        # Feature importance is printed for the final month only.
        if i == last_month_pos:
            importance = model.feature_importance(importance_type='gain')
            feat_imp = sorted(zip(feature_cols, importance), key=lambda x: -x[1])
            print(f"\n Top 10 features:", flush=True)
            for fname, imp in feat_imp[:10]:
                print(f" {fname:<20} {imp:.0f}", flush=True)

        # Diagnostic signal counts at a fixed 0.45 cut-off (independent of the
        # probability threshold later used by the backtest).
        long_cnt = (proba[:, 2] > 0.45).sum()
        short_cnt = (proba[:, 0] > 0.45).sum()
        print(f" {test_month}: train={len(train_df):,} test={len(test_df):,} "
              f"signals: long={long_cnt} short={short_cnt}", flush=True)

    return all_proba_long, all_proba_short


# ==================== Backtest engine ====================
def backtest(df, proba_long, proba_short, notional=10000.0,
             prob_threshold=0.45, min_hold=200, max_hold=1800,
             sl_pct=0.004, tp_pct=0.006):
    """Simulate a single-position strategy driven by model probabilities.

    Bars are processed in order, using each bar's close as the fill price.
    At most one position is open at any time.

    Exit rules, evaluated in this order on every bar while in a position:
        1. Hard stop: loss >= 1.5 * sl_pct (applies at any holding time).
        2. After ``min_hold`` seconds: stop loss (loss >= sl_pct), take profit
           (gain >= tp_pct), timeout (held >= ``max_hold`` seconds).
        3. After ``min_hold`` seconds: opposite-side probability above
           ``prob_threshold + 0.05`` closes the position ("AI reversal"); a new
           position may be opened on the same bar.
    Entry: when flat, go long if P(long) > prob_threshold and P(long) > P(short),
    or short in the mirrored case.

    Args:
        df: DataFrame with a ``close`` column and a DatetimeIndex.
        proba_long, proba_short: Series positionally aligned with ``df``.
        notional: Position size used to convert a return into PnL.
        prob_threshold: Minimum probability to open a position.
        min_hold, max_hold: Holding-time bounds in seconds.
        sl_pct, tp_pct: Stop-loss / take-profit distances as fractions.

    Returns:
        List of trade tuples
        ``(side, entry_price, exit_price, gross_pnl, held_seconds, reason,
        open_time, close_time)``. PnL excludes fees; any position still open
        at the end is closed at the last bar.
    """
    # Pull the columns out once: per-bar .iloc lookups dominate the runtime
    # on a full year of 1-minute bars.
    closes = df['close'].to_numpy()
    p_long = proba_long.to_numpy()
    p_short = proba_short.to_numpy()

    pos = 0      # 1 = long, -1 = short, 0 = flat
    op = 0.0     # entry price
    ot = None    # entry time
    trades = []

    for i, dt in enumerate(df.index):
        p = closes[i]
        pl = p_long[i]
        ps = p_short[i]

        # Position management
        if pos != 0 and ot is not None:
            pp = (p - op) / op if pos == 1 else (op - p) / op
            hsec = (dt - ot).total_seconds()

            # Hard stop: checked before the minimum holding time.
            if -pp >= sl_pct * 1.5:
                trades.append((pos, op, p, notional * pp, hsec, "硬止损", ot, dt))
                pos = 0
                continue

            if hsec >= min_hold:
                if -pp >= sl_pct:
                    trades.append((pos, op, p, notional * pp, hsec, "止损", ot, dt))
                    pos = 0
                    continue
                if pp >= tp_pct:
                    trades.append((pos, op, p, notional * pp, hsec, "止盈", ot, dt))
                    pos = 0
                    continue
                if hsec >= max_hold:
                    trades.append((pos, op, p, notional * pp, hsec, "超时", ot, dt))
                    pos = 0
                    continue

                # Close on an opposite model signal.
                # NOTE(review): this file was recovered without indentation;
                # the reversal exit is placed under the min_hold gate to
                # honour the ">3 minute hold" condition — confirm.
                if pos == 1 and ps > prob_threshold + 0.05:
                    trades.append((pos, op, p, notional * pp, hsec, "AI反转", ot, dt))
                    pos = 0
                elif pos == -1 and pl > prob_threshold + 0.05:
                    trades.append((pos, op, p, notional * pp, hsec, "AI反转", ot, dt))
                    pos = 0

        # Entry
        if pos == 0:
            if pl > prob_threshold and pl > ps:
                pos = 1
                op = p
                ot = dt
            elif ps > prob_threshold and ps > pl:
                pos = -1
                op = p
                ot = dt

    # Force-close whatever is still open at the last bar.
    if pos != 0:
        p = closes[-1]
        dt = df.index[-1]
        pp = (p - op) / op if pos == 1 else (op - p) / op
        trades.append((pos, op, p, notional * pp, (dt - ot).total_seconds(), "结束", ot, dt))

    return trades


# ==================== Result analysis ====================
def analyze(trades, notional, label):
    """Print a performance report for ``trades`` and return the net profit.

    Fees are modelled as 0.06% of ``notional`` per side (two sides per trade)
    with 90% of the fee rebated; the remaining net fee is charged per trade.

    Args:
        trades: Trade tuples as produced by ``backtest``; index 3 is the gross
            PnL, index 5 the exit reason, index 7 the close time.
        notional: Position size used for the fee computation.
        label: Title printed at the top of the report.

    Returns:
        Total PnL minus net fees, or 0 when there are no trades.
    """
    if not trades:
        print(f" [{label}] No trades", flush=True)
        return 0

    n = len(trades)
    gross_fee = notional * 0.0006 * 2
    rebate = gross_fee * 0.9
    NFEE = gross_fee - rebate

    pnls = [t[3] for t in trades]
    total_pnl = sum(pnls)
    net = total_pnl - NFEE * n
    wins = sum(1 for v in pnls if v > 0)
    wr = wins / n * 100
    total_reb = rebate * n

    # Per-month trade count, net PnL and winning-trade count, keyed by close time.
    monthly = defaultdict(lambda: {'n': 0, 'net': 0, 'w': 0})
    for trade, pnl in zip(trades, pnls):
        bucket = monthly[trade[7].strftime('%Y-%m')]
        bucket['n'] += 1
        bucket['net'] += pnl - NFEE
        if pnl > 0:
            bucket['w'] += 1

    # Maximum drawdown of the cumulative net-PnL curve.
    equity = 0
    peak = 0
    dd = 0
    for pnl in pnls:
        equity += pnl - NFEE
        peak = max(peak, equity)
        dd = max(dd, peak - equity)

    pm = sum(1 for stats in monthly.values() if stats['net'] > 0)

    reasons = defaultdict(int)
    for trade in trades:
        reasons[trade[5]] += 1

    rule = '=' * 75
    print(f"\n{rule}", flush=True)
    print(f" {label}", flush=True)
    print(f"{rule}", flush=True)
    print(f" 方向盈亏: {total_pnl:>+10.0f} USDT", flush=True)
    print(f" 返佣: {total_reb:>+10.0f} USDT", flush=True)
    print(f" 净手续费: {NFEE*n:>10.0f} USDT", flush=True)
    print(f" ================================", flush=True)
    print(f" 年净利: {net:>+10.0f} USDT (月均 {net/12:>+.0f})", flush=True)
    print(f" 交易: {n}笔 | 胜率: {wr:.1f}% | 盈利月: {pm}/12", flush=True)
    print(f" 最大回撤: {dd:>.0f} USDT", flush=True)

    if wins > 0:
        avg_win = sum(v for v in pnls if v > 0) / wins
        avg_loss = sum(v for v in pnls if v <= 0) / (n - wins) if n > wins else 0
        print(f" 均赢: {avg_win:>+.2f} | 均亏: {avg_loss:>+.2f} | 盈亏比: {abs(avg_win/avg_loss) if avg_loss!=0 else 999:.2f}", flush=True)

    print(f"\n 平仓原因:", flush=True)
    for r, cnt in sorted(reasons.items(), key=lambda item: -item[1]):
        print(f" {r:<10} {cnt:>5}笔 ({cnt/n*100:.1f}%)", flush=True)

    print(f"\n 月度:", flush=True)
    for m in sorted(monthly):
        d = monthly[m]
        wr_m = d['w'] / d['n'] * 100 if d['n'] > 0 else 0
        # One '+' / '-' per 100 USDT of monthly net profit / loss, capped at 30.
        plus_len = min(30, max(0, int(d['net'] / 100)))
        minus_len = min(30, max(0, int(-d['net'] / 100)))
        bar = "+" * plus_len + "-" * minus_len
        print(f" {m} {d['n']:>4}笔 {d['net']:>+8.0f} {wr_m:>4.0f}% {bar}", flush=True)
    print(f" {'合计':>7} {n:>4}笔 {net:>+8.0f}", flush=True)

    # Linear extrapolation of the result to other margin sizes at 100x leverage.
    print(f"\n 仓位放大:", flush=True)
    for margin in [100, 300, 500, 800, 1000]:
        scale = margin * 100 / notional
        print(f" {margin}U: 月均 {net*scale/12:>+.0f} USDT {'<<< 达标' if net*scale/12>=1000 else ''}", flush=True)

    print(f"{rule}", flush=True)
    return net


# ==================== Main entry point ====================
def main():
    """Run the full pipeline for every configuration and print a summary.

    Steps: load candles, build features once, then for each configuration
    generate labels, run walk-forward training/prediction, backtest and
    analyze. The configuration with the highest net profit is reported last.
    """
    started_at = _time.time()
    rule = "=" * 75
    base_notional = 10000.0

    print(rule, flush=True)
    print(" AI/ML 交易策略 — LightGBM + 30+技术指标", flush=True)
    print(rule, flush=True)

    print("\n[1/4] 加载数据...", flush=True)
    df = load_data()
    print(f" {len(df):,} 根 1分钟K线", flush=True)

    print("\n[2/4] 特征工程 (30+指标)...", flush=True)
    df = add_features(df)
    feature_cols = get_feature_cols(df)
    print(f" 生成 {len(feature_cols)} 个特征", flush=True)

    # Different look-ahead horizons and thresholds to compare:
    # (forward_bars, threshold, prob_threshold, sl, tp, label)
    configs = [
        (5, 0.001, 0.42, 0.003, 0.004, "AI-v1: 5bar前瞻 阈值0.1%"),
        (10, 0.002, 0.42, 0.004, 0.006, "AI-v2: 10bar前瞻 阈值0.2%"),
        (10, 0.002, 0.45, 0.004, 0.006, "AI-v3: 10bar 高置信0.45"),
        (10, 0.003, 0.45, 0.005, 0.008, "AI-v4: 10bar 阈值0.3% 宽SL"),
        (20, 0.003, 0.42, 0.005, 0.008, "AI-v5: 20bar前瞻 阈值0.3%"),
        (20, 0.004, 0.45, 0.005, 0.010, "AI-v6: 20bar 阈值0.4% 大TP"),
    ]

    best_net = -999999
    best_label = ""

    for fb, thresh, prob_th, sl, tp, label in configs:
        print(f"\n{rule}", flush=True)
        print(f" [{label}]", flush=True)
        print(f" 前瞻={fb}bar 方向阈值={thresh*100:.1f}% 概率阈值={prob_th} SL={sl*100:.1f}% TP={tp*100:.1f}%", flush=True)
        print(f"{rule}", flush=True)

        print("\n[3/4] 生成标签...", flush=True)
        # Work on a copy so each configuration starts from the same features.
        df_labeled = add_labels(df.copy(), forward_bars=fb, threshold=thresh)
        labels = df_labeled['label']
        n_long = int((labels == 1).sum())
        n_short = int((labels == -1).sum())
        n_flat = int((labels == 0).sum())
        print(f" 多={n_long:,} 空={n_short:,} 中性={n_flat:,}", flush=True)

        print("\n[4/4] 滚动训练+预测...", flush=True)
        proba_long, proba_short = train_predict_walkforward(
            df_labeled, feature_cols, train_months=3)

        print("\n 回测...", flush=True)
        trades = backtest(
            df_labeled, proba_long, proba_short,
            notional=base_notional, prob_threshold=prob_th,
            sl_pct=sl, tp_pct=tp)

        net = analyze(trades, base_notional, label)
        if net > best_net:
            best_net = net
            best_label = label

    elapsed = _time.time() - started_at
    print(f"\n\n{rule}", flush=True)
    print(f" 总结 | 耗时 {elapsed:.0f}s", flush=True)
    print(f"{rule}", flush=True)
    print(f" 最佳: {best_label}", flush=True)
    print(f" 年净利: {best_net:+.0f} USDT = 月均 {best_net/12:+.0f} USDT", flush=True)
    if best_net > 0:
        # Margin needed for 1000 USDT/month, scaling the 100 USDT result linearly.
        needed = int(12000 / best_net * 100) + 1
        print(f" 达到1000U/月需保证金: ~{needed}U", flush=True)
    print(f"{rule}", flush=True)
# Run the backtest only when executed as a script, not when imported.
if __name__ == "__main__":
    main()