diff --git a/README.md b/README.md index 7e459e4..a18db60 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,160 @@ -# btc-ml-optimizer +# BTC ML Trading Strategy Optimizer -Autonomous ML trading strategy optimizer with LLM-in-the-loop. XGBoost/LightGBM on GPU (RTX 4070 Ti) + qwen3.5:27b on Mac Mini for strategy analysis. Walk-forward backtesting on BTC OHLCV data. \ No newline at end of file +An automated optimization loop that trains ML models on BTC/USDT data, backtests trading strategies, and uses an LLM to iteratively improve the configuration. + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Optimization Loop │ +│ │ +│ ┌──────────┐ ┌───────────────┐ ┌──────────────────────┐ │ +│ │ VPS │───>│ Windows PC │───>│ Mac Mini │ │ +│ │ (Orch.) │<───│ (GPU/ML) │ │ (LLM) │ │ +│ │ │<───────────────────────>│ │ │ +│ │ - Fetch │ │ - XGBoost │ │ - Ollama │ │ +│ │ data │ │ - LightGBM │ │ - qwen3.5:27b │ │ +│ │ - Coord │ │ - CatBoost │ │ - Analyze results │ │ +│ │ - Store │ │ - RTX 4070 Ti │ │ - Suggest changes │ │ +│ └──────────┘ └───────────────┘ └──────────────────────┘ │ +│ ▲ │ │ +│ └────────────────────────────────────────┘ │ +│ Modified config │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Machines (Tailscale) + +| Machine | Role | Address | Key Resources | +|------------|-------------|-------------------|---------------------| +| VPS | Orchestrator | localhost | Coordination, data | +| Windows PC | ML Engine | 100.76.218.38 | RTX 4070 Ti GPU | +| Mac Mini | LLM | 100.100.242.21 | Ollama, qwen3.5:27b | + +## Directory Structure + +``` +btc-ml-optimizer/ +├── orchestrator.py # Main loop — coordinates everything +├── ml_engine/ +│ └── train_and_backtest.py # Self-contained ML script (runs on Windows) +├── llm_client/ +│ └── analyzer.py # LLM strategy analyzer (calls Mac Mini) +├── scripts/ +│ ├── fetch_data.py # BTC/USDT data fetcher (ccxt) +│ └── setup_windows.sh # Install deps on Windows 
PC +├── config/ +│ └── initial_config.json # Starting configuration +├── data/ # OHLCV CSV files +├── results/ # Iteration results + logs +├── requirements_vps.txt # VPS Python dependencies +└── requirements_windows.txt # Windows PC Python dependencies +``` + +## Setup + +### 1. VPS (this machine) + +```bash +pip install -r requirements_vps.txt +``` + +### 2. Windows PC + +```bash +# From VPS — installs all ML deps on Windows via SSH +bash scripts/setup_windows.sh +``` + +Or manually on Windows: +```bash +pip install -r requirements_windows.txt +``` + +### 3. Mac Mini + +Ensure Ollama is running with the qwen3.5:27b model: +```bash +ollama pull qwen3.5:27b +ollama serve # should already be running +``` + +## Usage + +### Fetch Data + +```bash +python3 scripts/fetch_data.py +``` + +Downloads 2 years of BTC/USDT 1h and 4h OHLCV data from Binance. + +### Run the Optimizer + +```bash +python3 orchestrator.py +``` + +The optimizer will: +1. Ensure data is fetched +2. Upload ML engine + data to Windows PC +3. Train model and backtest on GPU +4. Send results to LLM for analysis +5. Apply LLM-suggested config changes +6. 
Repeat until convergence (or 50 iterations) + +### Run ML Engine Standalone (on Windows) + +```bash +python train_and_backtest.py --config config.json --data btc_4h.csv --output results.json +``` + +## Configuration Reference + +### `model_type` +- `xgboost` — XGBoost with GPU (default, generally best) +- `lightgbm` — LightGBM with GPU (faster training) +- `catboost` — CatBoost with GPU (handles interactions well) +- `ensemble` — Soft voting of all three + +### `features` +- `technical_indicators` — List of indicators to compute +- `lookback_periods` — Windows for return/volatility features +- `use_volume_features` — Include volume-derived features +- `use_volatility_features` — Include volatility features +- `use_candle_patterns` — Include candlestick pattern features +- `use_lag_features` — Include lagged feature values +- `lag_periods` — Specific lag periods to use + +### `target` +- `direction` — `"long"` or `"both"` +- `horizon_candles` — Forward-looking prediction window +- `threshold_pct` — Minimum % move to label as positive + +### `hyperparameters` +Standard gradient boosting params: `learning_rate`, `max_depth`, `n_estimators`, `subsample`, `colsample_bytree`, `min_child_weight`, `gamma`, `reg_alpha`, `reg_lambda` + +### `strategy` +- `entry_threshold` — Min probability to enter trade (0.5-0.8) +- `stop_loss_pct` — Stop loss percentage +- `take_profit_pct` — Take profit percentage +- `trailing_stop_pct` — Trailing stop distance +- `position_sizing` — `"confidence_scaled"` or `"fixed"` +- `min_confidence_to_trade` — Absolute minimum confidence + +### `training` +- `walk_forward_windows` — Number of walk-forward splits (3-10) +- `train_pct` / `validation_pct` / `test_pct` — Data split ratios + +## Convergence Criteria + +The optimizer stops when: +- Sharpe ratio exceeds 3.0 +- Sharpe improvement < 1% over 5 consecutive iterations +- Maximum 50 iterations reached + +## Output + +- `config/best_config.json` — Best configuration found +- 
`results/iterations.jsonl` — Full log of every iteration +- `results/results_iter_N.json` — Detailed results per iteration diff --git a/config/initial_config.json b/config/initial_config.json new file mode 100644 index 0000000..5b7a756 --- /dev/null +++ b/config/initial_config.json @@ -0,0 +1,59 @@ +{ + "model_type": "xgboost", + "features": { + "technical_indicators": [ + "RSI_14", "RSI_7", "RSI_21", + "MACD_line", "MACD_signal", "MACD_hist", + "BB_upper", "BB_lower", "BB_width", + "ATR_14", + "SMA_5", "SMA_10", "SMA_20", "SMA_50", "SMA_200", + "EMA_5", "EMA_10", "EMA_20", "EMA_50", + "OBV", + "stoch_k", "stoch_d", + "williams_r", + "CCI_20", + "ROC_10", + "keltner_upper", "keltner_lower" + ], + "lookback_periods": [3, 5, 10, 20], + "use_volume_features": true, + "use_volatility_features": true, + "use_candle_patterns": true, + "use_lag_features": true, + "lag_periods": [1, 2, 3, 5] + }, + "target": { + "type": "classification", + "direction": "long", + "horizon_candles": 6, + "threshold_pct": 1.0 + }, + "hyperparameters": { + "learning_rate": 0.05, + "max_depth": 6, + "n_estimators": 500, + "subsample": 0.8, + "colsample_bytree": 0.8, + "min_child_weight": 5, + "gamma": 0.1, + "reg_alpha": 0.1, + "reg_lambda": 1.0 + }, + "strategy": { + "entry_threshold": 0.60, + "exit_type": "trailing_stop", + "stop_loss_pct": 2.0, + "take_profit_pct": 4.0, + "trailing_stop_pct": 1.5, + "position_sizing": "confidence_scaled", + "max_position_pct": 100, + "min_confidence_to_trade": 0.55 + }, + "training": { + "walk_forward_windows": 5, + "train_pct": 0.7, + "validation_pct": 0.15, + "test_pct": 0.15 + }, + "timeframe": "4h" +} diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/llm_client/__init__.py b/llm_client/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/llm_client/analyzer.py b/llm_client/analyzer.py new file mode 100755 index 0000000..3911940 --- /dev/null +++ b/llm_client/analyzer.py @@ -0,0 +1,209 @@ 
+#!/usr/bin/env python3 +""" +LLM Strategy Analyzer — Calls Ollama on Mac Mini to analyze results +and suggest config modifications for the next iteration. +""" + +import json +import re +import requests + +OLLAMA_URL = "http://100.100.242.21:11434" +MODEL = "qwen3.5:27b" + +SYSTEM_PROMPT = """You are a quantitative trading strategy optimizer. You analyze ML model backtesting results for a BTC/USDT trading strategy and suggest precise modifications to improve performance. + +## Your Task +Given the current configuration and results, suggest 1-3 specific, justified changes to the configuration for the next iteration. Be methodical and scientific — change one thing at a time when possible. + +## Config Parameters You Can Modify + +**model_type**: "xgboost", "lightgbm", "catboost", or "ensemble" + - xgboost: Generally best for structured data, fast GPU training + - lightgbm: Faster training, good with large feature sets + - catboost: Handles feature interactions well, less tuning needed + - ensemble: Combines all three, reduces variance but slower + +**hyperparameters**: + - learning_rate (0.001-0.3): Lower = more robust but slower. If overfitting, decrease. + - max_depth (3-10): Controls model complexity. Deeper = more overfitting risk. + - n_estimators (100-2000): More trees = better fit but diminishing returns. + - subsample (0.5-1.0): Row sampling. Lower = more regularization. + - colsample_bytree (0.5-1.0): Feature sampling per tree. Lower = more diversity. + - min_child_weight (1-20): Higher = more conservative splits. + - gamma (0-5): Minimum loss reduction for split. Higher = more pruning. + - reg_alpha (0-10): L1 regularization. Encourages sparsity. + - reg_lambda (0-10): L2 regularization. Prevents large weights. + +**target**: + - direction: "long" or "both" + - horizon_candles (1-20): How far ahead to predict. Longer = smoother but lagging. + - threshold_pct (0.3-3.0): Minimum move % to label as positive. Higher = fewer but clearer signals. 
+ +**strategy**: + - entry_threshold (0.5-0.8): Min prediction probability to enter trade. Higher = fewer trades, higher quality. + - stop_loss_pct (0.5-5.0): Max loss before exit. Tighter = more stopped out. + - take_profit_pct (1.0-10.0): Target profit. Should be > stop_loss for positive expectancy. + - trailing_stop_pct (0.5-3.0): Trailing stop distance. Tighter = locks profit faster but exits early. + - min_confidence_to_trade (0.5-0.9): Absolute minimum confidence to consider. + - exit_type: "trailing_stop" or "fixed" (just SL/TP) + +**features**: + - use_volume_features (true/false): Volume features can be noisy in crypto. + - use_candle_patterns (true/false): Candle patterns may or may not help. + - use_lag_features (true/false): Lagged features capture momentum. + - lag_periods: List of lag periods [1,2,3,5,10] + - lookback_periods: List of lookback windows [3,5,10,20] + +**training**: + - walk_forward_windows (3-10): More windows = more robust but less data per window. + +## Key Metrics to Optimize (in priority order) +1. **Sharpe Ratio** (target: > 2.0): Risk-adjusted return. Most important metric. +2. **Profit Factor** (target: > 1.5): Gross profit / gross loss. +3. **Max Drawdown** (target: > -15%): Worst peak-to-trough decline. +4. **Win Rate** (target: > 55%): Percentage of winning trades. +5. **Trade Count**: Need enough trades for statistical significance (>50). + +## Decision Guidelines +- If Sharpe < 1.0: The strategy is not working well. Consider larger changes. +- If Sharpe 1.0-1.5: Decent. Fine-tune hyperparameters and thresholds. +- If Sharpe 1.5-2.0: Good. Make small, targeted improvements. +- If Sharpe > 2.0: Very good. Be careful not to overfit. +- If win_rate < 0.50 but profit_factor > 1.5: Strategy relies on big wins — ok, tighten SL. +- If win_rate > 0.60 but profit_factor < 1.2: Many small wins but losses are too big — widen TP or tighten SL. +- If trade_count < 30: Not enough trades. Lower entry_threshold or min_confidence. 
def _extract_json_object(content: str) -> dict:
    """Return the first JSON object embedded in an LLM reply.

    A fenced ```json block is preferred. Otherwise every ``{`` in the text is
    tried as the start of an object using the real JSON decoder, so braces
    that appear inside string values cannot confuse the scan.

    Raises:
        ValueError: if the reply contains no decodable JSON object.
    """
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", content, re.DOTALL)
    if fenced:
        try:
            candidate = json.loads(fenced.group(1))
        except json.JSONDecodeError:
            candidate = None  # malformed fence: fall through to the scan
        if isinstance(candidate, dict):
            return candidate

    start = content.find("{")
    if start < 0:
        raise ValueError(f"No JSON found in LLM response: {content[:200]}")

    decoder = json.JSONDecoder()
    while start >= 0:
        try:
            candidate, _ = decoder.raw_decode(content, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = content.find("{", start + 1)
    raise ValueError("Could not find complete JSON in LLM response")


def analyze_and_suggest(current_config: dict, results: dict,
                        iteration_history: list = None) -> tuple[dict, str]:
    """
    Send current results to LLM and get suggested config modifications.

    Args:
        current_config: Configuration used for the iteration just finished.
        results: Metrics dict of that iteration; missing metrics are shown
            to the model as 0 / empty.
        iteration_history: Optional list of dicts with the keys
            ``iteration``, ``sharpe``, ``return``, ``win_rate``, ``trades``
            and ``model_type``; only the last five entries are sent.

    Returns:
        ``(new_config, change_summary)``. ``new_config`` is always a new
        dict: top-level sections the model omitted are copied from
        ``current_config``, and ``current_config`` itself is never mutated
        or returned.

    Raises:
        requests.HTTPError: if Ollama answers with an error status.
        ValueError: if no JSON object can be found in the reply.
    """
    import copy

    # Build the user prompt with context
    history_text = ""
    if iteration_history:
        history_text = "\n## Previous Iterations (most recent last)\n"
        for h in iteration_history[-5:]:
            history_text += (
                f"- Iteration {h['iteration']}: Sharpe={h['sharpe']}, "
                f"Return={h['return']}%, WinRate={h['win_rate']}, "
                f"Trades={h['trades']}, Model={h['model_type']}\n"
            )

    user_prompt = f"""## Current Configuration
```json
{json.dumps(current_config, indent=2)}
```

## Current Results
- Sharpe Ratio: {results.get('sharpe_ratio', 0)}
- Total Return: {results.get('total_return_pct', 0)}%
- Max Drawdown: {results.get('max_drawdown_pct', 0)}%
- Win Rate: {results.get('win_rate', 0)}
- Trade Count: {results.get('trade_count', 0)}
- Profit Factor: {results.get('profit_factor', 0)}
- Avg Trade Duration: {results.get('avg_trade_duration_candles', 0)} candles
- Per-Window Sharpe: {results.get('per_window_sharpe', [])}

## Top Feature Importances
{json.dumps(dict(list(results.get('feature_importances', {}).items())[:15]), indent=2)}
{history_text}
Analyze these results and suggest 1-3 specific modifications to the config. Return ONLY valid JSON."""

    # Call Ollama
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        "stream": False,
        "options": {
            "temperature": 0.7,
            "num_predict": 4096,
        },
    }

    print(f" Calling LLM ({MODEL} on Mac Mini)...")
    resp = requests.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=300)
    resp.raise_for_status()
    content = resp.json()["message"]["content"]

    # Reasoning models may prepend a <think>...</think> section; drop it so
    # braces inside the reasoning text are never mistaken for the answer.
    content = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()

    parsed = _extract_json_object(content)

    reasoning = parsed.get("reasoning", "No reasoning provided")
    changes = parsed.get("changes", [])
    if isinstance(changes, str):
        changes = [changes]
    elif not isinstance(changes, (list, tuple)):
        changes = []

    suggested = parsed.get("config")
    new_config = copy.deepcopy(suggested) if isinstance(suggested, dict) else {}

    # Carry over every top-level section the model left out. This covers the
    # required sections (model_type, features, target, hyperparameters,
    # strategy, training) as well as extras such as "timeframe".
    for key, value in current_config.items():
        if key not in new_config:
            new_config[key] = copy.deepcopy(value)

    change_summary = f"{reasoning}\nChanges: {', '.join(map(str, changes))}"
    return new_config, change_summary
if __name__ == "__main__":
    # Manual smoke test: load a config (path from argv[1], otherwise the
    # repository's initial config), pair it with hard-coded example metrics
    # and print what the LLM suggests. This performs a real Ollama request.
    import sys
    config_path = sys.argv[1] if len(sys.argv) > 1 else "config/initial_config.json"
    with open(config_path) as f:
        config = json.load(f)

    # Example metrics only; they do not come from a real backtest.
    dummy_results = {
        "sharpe_ratio": 1.2,
        "total_return_pct": 15.3,
        "max_drawdown_pct": -12.5,
        "win_rate": 0.55,
        "trade_count": 120,
        "profit_factor": 1.4,
        "avg_trade_duration_candles": 7.2,
        "feature_importances": {"RSI_14": 0.15, "MACD_hist": 0.12, "BB_width": 0.10},
        "per_window_sharpe": [1.0, 1.3, 1.5, 0.9, 1.1],
    }

    new_config, reasoning = analyze_and_suggest(config, dummy_results)
    print(f"\nReasoning: {reasoning}")
    print(f"\nNew config:\n{json.dumps(new_config, indent=2)}")
def compute_features(df: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Compute 60+ technical features from OHLCV data.

    The feature columns are added to ``df`` in place and the same frame is
    returned. ``df`` must provide ``open``, ``high``, ``low``, ``close`` and
    ``volume`` columns.

    Only these keys of ``config["features"]`` are read here:
    ``use_volume_features``, ``use_candle_patterns``, ``use_lag_features``,
    ``lag_periods`` and ``lookback_periods``. Trend, momentum and volatility
    indicators are always computed.

    Rows inside an indicator's warm-up window hold NaN. Infinite values
    (e.g. a percentage change from zero volume) are converted to NaN as
    well, so a single ``dropna`` in the caller removes every unusable row.
    """
    feat = config.get("features", {})
    c, h, l, o, v = df["close"], df["high"], df["low"], df["open"], df["volume"]

    # --- Price SMAs & EMAs ---
    for p in [5, 10, 20, 50, 200]:
        df[f"SMA_{p}"] = SMAIndicator(c, window=p).sma_indicator()
        df[f"price_vs_SMA_{p}"] = c / df[f"SMA_{p}"] - 1
    for p in [5, 10, 20, 50]:
        df[f"EMA_{p}"] = EMAIndicator(c, window=p).ema_indicator()

    # --- Momentum ---
    for p in [7, 14, 21]:
        df[f"RSI_{p}"] = RSIIndicator(c, window=p).rsi()

    macd = MACD(c, window_slow=26, window_fast=12, window_sign=9)
    df["MACD_line"] = macd.macd()
    df["MACD_signal"] = macd.macd_signal()
    df["MACD_hist"] = macd.macd_diff()

    stoch = StochasticOscillator(h, l, c, window=14, smooth_window=3)
    df["stoch_k"] = stoch.stoch()
    df["stoch_d"] = stoch.stoch_signal()

    df["williams_r"] = WilliamsRIndicator(h, l, c, lbp=14).williams_r()
    df["ROC_10"] = ROCIndicator(c, window=10).roc()
    df["CCI_20"] = CCIIndicator(h, l, c, window=20).cci()

    # --- Volatility ---
    bb = BollingerBands(c, window=20, window_dev=2)
    df["BB_upper"] = bb.bollinger_hband()
    df["BB_lower"] = bb.bollinger_lband()
    df["BB_width"] = (df["BB_upper"] - df["BB_lower"]) / c
    df["BB_pctb"] = bb.bollinger_pband()

    df["ATR_14"] = AverageTrueRange(h, l, c, window=14).average_true_range()
    df["ATR_pct"] = df["ATR_14"] / c

    kc = KeltnerChannel(h, l, c, window=20)
    df["keltner_upper"] = kc.keltner_channel_hband()
    df["keltner_lower"] = kc.keltner_channel_lband()

    df["hist_volatility"] = c.pct_change().rolling(20).std() * np.sqrt(252)

    # --- Volume ---
    if feat.get("use_volume_features", True):
        df["OBV"] = OnBalanceVolumeIndicator(c, v).on_balance_volume()
        df["volume_sma_20"] = v.rolling(20).mean()
        df["volume_ratio"] = v / df["volume_sma_20"]
        df["volume_momentum"] = v.pct_change(5)
        # VWAP approximation (rolling)
        tp = (h + l + c) / 3
        df["vwap_approx"] = (tp * v).rolling(20).sum() / v.rolling(20).sum()
        df["price_vs_vwap"] = c / df["vwap_approx"] - 1

    # --- Candle Patterns ---
    if feat.get("use_candle_patterns", True):
        body = (c - o).abs()
        # A zero-range candle would divide by zero; treat its ratios as missing.
        full_range = (h - l).replace(0, np.nan)
        df["candle_body_ratio"] = body / full_range
        df["upper_wick_ratio"] = (h - pd.concat([c, o], axis=1).max(axis=1)) / full_range
        df["lower_wick_ratio"] = (pd.concat([c, o], axis=1).min(axis=1) - l) / full_range
        df["is_bullish"] = (c > o).astype(int)
        # Consecutive up/down: cumulative count inside each run of equal flags
        bullish = df["is_bullish"]
        bearish = 1 - bullish
        df["consecutive_up"] = bullish.groupby((bullish != bullish.shift()).cumsum()).cumsum()
        df["consecutive_down"] = bearish.groupby((bearish != bearish.shift()).cumsum()).cumsum()

    # --- Lag Features ---
    if feat.get("use_lag_features", True):
        for lag in feat.get("lag_periods", [1, 2, 3, 5]):
            df[f"return_lag_{lag}"] = c.pct_change(lag)
            df[f"volume_lag_{lag}"] = v.pct_change(lag)
            # RSI_14 is always computed in the momentum section above.
            df[f"RSI_14_lag_{lag}"] = df["RSI_14"].shift(lag)

    # --- Lookback period features ---
    for p in feat.get("lookback_periods", [3, 5, 10, 20]):
        df[f"return_{p}"] = c.pct_change(p)
        df[f"volatility_{p}"] = c.pct_change().rolling(p).std()
        df[f"high_low_range_{p}"] = (h.rolling(p).max() - l.rolling(p).min()) / c

    # Ratios and percentage changes yield +/-inf when the denominator is 0
    # (e.g. a zero-volume candle). dropna() keeps inf, so map it to NaN here.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

    return df
def create_target(df: pd.DataFrame, config: dict) -> pd.Series:
    """Create binary target: will price move >= threshold% within horizon?

    For each row the next ``horizon_candles`` closes (excluding the row
    itself) are inspected:

    * ``direction == "long"``  -> 1 if the highest future close is at least
      ``threshold_pct`` percent above the current close.
    * ``direction == "short"`` -> 1 if the current close is at least
      ``threshold_pct`` percent above the lowest future close.
    * any other value ("both") -> 1 if either condition holds.

    Returns:
        A float Series aligned with ``df`` holding 1.0 / 0.0. Rows whose
        future window is incomplete (the last ``horizon_candles`` rows, or
        windows containing NaN closes) are NaN rather than 0, because their
        outcome is unknown; callers are expected to drop them.

    Raises:
        ValueError: if ``horizon_candles`` is smaller than 1.
    """
    tgt = config.get("target", {})
    horizon = int(tgt.get("horizon_candles", 6))  # tolerate 6.0 from JSON/LLM edits
    if horizon < 1:
        raise ValueError(f"horizon_candles must be >= 1, got {horizon}")
    threshold = tgt.get("threshold_pct", 1.0) / 100.0
    direction = tgt.get("direction", "long")

    close = df["close"]
    upcoming = close.shift(-1)
    # rolling() looks backwards, so roll over the shifted series and shift the
    # result back: row i ends up with the extreme of closes i+1 .. i+horizon.
    future_max = upcoming.rolling(horizon).max().shift(-(horizon - 1))
    future_min = upcoming.rolling(horizon).min().shift(-(horizon - 1))

    long_hit = (future_max / close) - 1 >= threshold
    short_hit = (close / future_min) - 1 >= threshold

    if direction == "long":
        hit = long_hit
    elif direction == "short":
        hit = short_hit
    else:  # both
        hit = long_hit | short_hit

    # A comparison against NaN is False, which would silently label the
    # unknowable tail as negative; mask those rows instead.
    known = future_max.notna() & future_min.notna()
    return hit.astype(float).where(known)
def _cuda_available() -> bool:
    """Report whether torch can see a CUDA device (False if torch is absent)."""
    try:
        import torch
    except ImportError:
        return False
    return torch.cuda.is_available()


def build_model(config: dict):
    """Build the ML model based on config.

    ``config["model_type"]`` selects ``"xgboost"`` (default), ``"lightgbm"``,
    ``"catboost"`` or ``"ensemble"`` (soft-voting over the three).
    ``config["hyperparameters"]`` uses XGBoost-style names, which are mapped
    onto each library's own parameter names. The backing library is imported
    lazily, so only the selected one has to be installed.

    Returns:
        An unfitted scikit-learn compatible classifier.

    Raises:
        ValueError: if ``model_type`` is not one of the supported values.
    """
    model_type = config.get("model_type", "xgboost")
    hp = config.get("hyperparameters", {})

    if model_type == "xgboost":
        import xgboost as xgb

        params = {
            "learning_rate": hp.get("learning_rate", 0.05),
            "max_depth": hp.get("max_depth", 6),
            "n_estimators": hp.get("n_estimators", 500),
            "subsample": hp.get("subsample", 0.8),
            "colsample_bytree": hp.get("colsample_bytree", 0.8),
            "min_child_weight": hp.get("min_child_weight", 5),
            "gamma": hp.get("gamma", 0.1),
            "reg_alpha": hp.get("reg_alpha", 0.1),
            "reg_lambda": hp.get("reg_lambda", 1.0),
            "eval_metric": "logloss",
            "random_state": 42,
            "device": "cuda" if _cuda_available() else "cpu",
            "verbosity": 0,
        }
        return xgb.XGBClassifier(**params)

    if model_type == "lightgbm":
        import lightgbm as lgb

        params = {
            "learning_rate": hp.get("learning_rate", 0.05),
            "max_depth": hp.get("max_depth", 6),
            "n_estimators": hp.get("n_estimators", 500),
            "subsample": hp.get("subsample", 0.8),
            # LightGBM applies row subsampling only when the bagging
            # frequency is non-zero; without it "subsample" is a no-op.
            "subsample_freq": 1,
            "colsample_bytree": hp.get("colsample_bytree", 0.8),
            "min_child_samples": hp.get("min_child_weight", 5),
            "reg_alpha": hp.get("reg_alpha", 0.1),
            "reg_lambda": hp.get("reg_lambda", 1.0),
            "random_state": 42,
            "verbose": -1,
            # NOTE(review): the constructor only stores parameters, so an
            # unusable GPU build surfaces at fit() time, not here. The former
            # try/except around the constructor could never fall back to CPU.
            "device": "gpu",
        }
        return lgb.LGBMClassifier(**params)

    if model_type == "catboost":
        from catboost import CatBoostClassifier

        gpu_available = _cuda_available()
        params = {
            "learning_rate": hp.get("learning_rate", 0.05),
            "depth": hp.get("max_depth", 6),
            "iterations": hp.get("n_estimators", 500),
            "subsample": hp.get("subsample", 0.8),
            "l2_leaf_reg": hp.get("reg_lambda", 1.0),
            "random_seed": 42,
            "verbose": 0,
            "task_type": "GPU" if gpu_available else "CPU",
        }
        if gpu_available:
            # On GPU CatBoost defaults to the Bayesian bootstrap, which does
            # not accept "subsample"; Bernoulli supports it on GPU.
            params["bootstrap_type"] = "Bernoulli"
        return CatBoostClassifier(**params)

    if model_type == "ensemble":
        from sklearn.ensemble import VotingClassifier

        members = [
            (sub_type, build_model({**config, "model_type": sub_type}))
            for sub_type in ["xgboost", "lightgbm", "catboost"]
        ]
        return VotingClassifier(estimators=members, voting="soft")

    raise ValueError(f"Unknown model_type: {model_type}")
def walk_forward_train_test(df: pd.DataFrame, feature_cols: list, config: dict) -> dict:
    """Walk-forward validation with backtesting on each window.

    ``df`` is cut into ``walk_forward_windows`` slices of ``len(df) //
    n_windows`` rows, each extended by 30% of a slice (clipped at the end of
    the data). Inside a slice the first ``train_pct`` rows train a fresh
    model, the following ``validation_pct`` rows are left unused (they act
    as a gap between training and test data) and the remaining rows are
    backtested. Trades of all windows are pooled for the final metrics.

    A window is skipped when it has fewer than 10 test rows, when its
    training labels contain a single class, or when training raises.

    Args:
        df: Feature frame that also holds ``target`` and the OHLC columns.
        feature_cols: Names of the model input columns.
        config: Full run configuration (``training``, ``strategy`` and the
            model sections are read).

    Returns:
        The metrics dict produced by ``compile_results``.

    Raises:
        ValueError: if ``walk_forward_windows`` is smaller than 1.
    """
    training_cfg = config.get("training", {})
    n_windows = training_cfg.get("walk_forward_windows", 5)
    train_pct = training_cfg.get("train_pct", 0.7)
    val_pct = training_cfg.get("validation_pct", 0.15)
    if n_windows < 1:
        raise ValueError(f"walk_forward_windows must be >= 1, got {n_windows}")

    n = len(df)
    window_size = n // n_windows
    strategy = config.get("strategy", {})

    all_trades = []
    per_window_sharpe = []
    feature_importances_sum = np.zeros(len(feature_cols))
    fi_count = 0

    for w in range(n_windows):
        start = w * window_size
        end = min((w + 1) * window_size + int(window_size * 0.3), n)  # overlap for test

        window_data = df.iloc[start:end].copy()
        wn = len(window_data)

        train_end = int(wn * train_pct)
        val_end = int(wn * (train_pct + val_pct))

        train_df = window_data.iloc[:train_end]
        test_df = window_data.iloc[val_end:]

        if len(test_df) < 10 or train_df["target"].nunique() < 2:
            print(f" Window {w+1}: skipped — too little test data or single-class labels",
                  file=sys.stderr)
            continue

        X_train = train_df[feature_cols].values
        y_train = train_df["target"].values
        X_test = test_df[feature_cols].values

        # Train model
        model = build_model(config)
        try:
            model.fit(X_train, y_train)
        except Exception as e:
            print(f" Window {w+1}: training failed — {e}", file=sys.stderr)
            continue

        # Get predictions on test set; hard labels are the fallback for
        # models that cannot produce probabilities.
        try:
            proba = model.predict_proba(X_test)[:, 1]
        except Exception:
            preds = model.predict(X_test)
            proba = preds.astype(float)

        # Extract feature importances, normalised per window so each window
        # contributes equally to the average. Best effort: a failure here
        # must not discard the window's trades.
        try:
            if hasattr(model, "feature_importances_"):
                fi = model.feature_importances_
            elif hasattr(model, "get_booster"):
                fi_dict = model.get_booster().get_score(importance_type="gain")
                fi = np.array([fi_dict.get(f"f{i}", 0) for i in range(len(feature_cols))])
            else:
                fi = np.zeros(len(feature_cols))
            feature_importances_sum += fi / (fi.sum() + 1e-10)
            fi_count += 1
        except Exception:
            pass

        # Backtest on test set
        trades = backtest(test_df, proba, strategy)
        all_trades.extend(trades)

        # Window sharpe
        # NOTE(review): the scaling divides 252 by the trade count, so more
        # trades lower the factor; confirm this is the intended annualisation.
        if trades:
            returns = [t["return_pct"] for t in trades]
            mean_r = np.mean(returns)
            std_r = np.std(returns) if len(returns) > 1 else 1.0
            sharpe = (mean_r / std_r) * np.sqrt(252 / max(1, len(trades))) if std_r > 0 else 0
            per_window_sharpe.append(round(sharpe, 3))
        else:
            per_window_sharpe.append(0.0)

        print(f" Window {w+1}/{n_windows}: {len(trades)} trades, sharpe={per_window_sharpe[-1]}")

    return compile_results(all_trades, per_window_sharpe, feature_importances_sum, fi_count, feature_cols, df)
def backtest(test_df: pd.DataFrame, proba: np.ndarray, strategy: dict) -> list:
    """Simulate trades using model predictions.

    Long-only, one position at a time. A position is opened at the close of
    bar ``i`` when ``proba[i]`` reaches both ``entry_threshold`` and
    ``min_confidence_to_trade``; the last bar never opens a position. On
    each later bar the exits are evaluated in this order:

    1. stop loss (bar low), checked first so that a bar touching both stop
       and target counts as a loss;
    2. take profit (bar high);
    3. trailing stop, only for ``exit_type == "trailing_stop"``. It is tested
       against the peak of *earlier* bars using the bar low, and afterwards
       against the peak including this bar's high using the bar close. The
       close is the last price of a bar, so the second test is valid
       whichever of high and low came first. Testing the low against the
       same bar's high would invent exits when the low preceded the high.

    Positions still open at the end are closed at the final close. A fee of
    0.1% is charged on entry and on exit, and the net return is scaled by
    the position size multiplier.

    Args:
        test_df: Frame with ``close``, ``high`` and ``low`` columns.
        proba: Positive-class probability per row of ``test_df``.
        strategy: The ``strategy`` section of the configuration.

    Returns:
        A list of trade dicts with ``entry_idx``, ``exit_idx``,
        ``entry_price``, ``exit_price``, ``return_pct``, ``confidence``,
        ``size_mult`` and ``duration`` (bars held).

    Raises:
        ValueError: if ``proba`` and ``test_df`` differ in length.
    """
    entry_threshold = strategy.get("entry_threshold", 0.6)
    stop_loss = strategy.get("stop_loss_pct", 2.0) / 100
    take_profit = strategy.get("take_profit_pct", 4.0) / 100
    trailing_stop = strategy.get("trailing_stop_pct", 1.5) / 100
    use_trailing = strategy.get("exit_type", "trailing_stop") == "trailing_stop"
    min_confidence = strategy.get("min_confidence_to_trade", 0.55)
    confidence_scaled = strategy.get("position_sizing") == "confidence_scaled"
    fee = 0.001  # 0.1% per trade

    closes = test_df["close"].values
    highs = test_df["high"].values
    lows = test_df["low"].values
    n_bars = len(closes)
    if len(proba) != n_bars:
        raise ValueError(
            f"proba has {len(proba)} entries but test_df has {n_bars} rows"
        )

    required = max(entry_threshold, min_confidence)
    trades = []
    i = 0

    while i < n_bars - 1:
        confidence = proba[i]
        # Written as "not >=" so that a NaN probability never opens a trade.
        if not confidence >= required:
            i += 1
            continue

        # Enter trade
        entry_price = closes[i]
        # Position sizing based on confidence
        if confidence_scaled:
            if confidence > 0.8:
                size_mult = 1.0
            elif confidence > 0.65:
                size_mult = 0.75
            else:
                size_mult = 0.5
        else:
            size_mult = 1.0

        peak = entry_price
        j = i + 1

        while j < n_bars:
            # Check stop loss
            if (entry_price - lows[j]) / entry_price >= stop_loss:
                exit_price = entry_price * (1 - stop_loss)
                break
            # Check take profit
            if (highs[j] - entry_price) / entry_price >= take_profit:
                exit_price = entry_price * (1 + take_profit)
                break
            if use_trailing:
                # Stop level carried in from earlier bars.
                if (peak - lows[j]) / peak >= trailing_stop:
                    exit_price = peak * (1 - trailing_stop)
                    break
                peak = max(peak, highs[j])
                # New peak made in this bar and the bar closed below its stop.
                if (peak - closes[j]) / peak >= trailing_stop:
                    exit_price = peak * (1 - trailing_stop)
                    break
            j += 1
        else:
            # Exit at end of test period
            exit_price = closes[-1]

        exit_idx = j if j < n_bars else n_bars - 1

        raw_return = (exit_price - entry_price) / entry_price
        net_return = raw_return - 2 * fee  # entry + exit fees
        net_return *= size_mult

        trades.append({
            "entry_idx": i,
            "exit_idx": exit_idx,
            "entry_price": float(entry_price),
            "exit_price": float(exit_price),
            "return_pct": float(net_return * 100),
            "confidence": float(confidence),
            "size_mult": float(size_mult),
            "duration": exit_idx - i,
        })

        i = j + 1  # Skip to after exit

    return trades
def compile_results(trades: list, per_window_sharpe: list,
                    fi_sum: np.ndarray, fi_count: int,
                    feature_cols: list, df: pd.DataFrame) -> dict:
    """Compile all results into output JSON.

    Args:
        trades: Trade dicts carrying at least ``return_pct`` (percent) and
            ``duration``; returns are compounded in list order.
        per_window_sharpe: Per-window Sharpe values, passed through as is.
        fi_sum: Sum of per-window normalised feature importances.
        fi_count: Number of windows that contributed to ``fi_sum``.
        feature_cols: Feature names aligned with ``fi_sum``.
        df: Accepted for interface compatibility; not read by this function.

    Returns:
        A JSON-serialisable dict of metrics. ``max_drawdown_pct`` is zero or
        negative, ``feature_importances`` holds the 30 largest averages and
        ``equity_curve`` has at most about 100 points, always starting at
        1.0 and ending at the final equity. With no trades every metric is
        zero or empty.
    """
    if not trades:
        return {
            "sharpe_ratio": 0.0,
            "total_return_pct": 0.0,
            "max_drawdown_pct": 0.0,
            "win_rate": 0.0,
            "trade_count": 0,
            "profit_factor": 0.0,
            "avg_trade_duration_candles": 0.0,
            "feature_importances": {},
            "monthly_returns": [],
            "equity_curve": [],
            "per_window_sharpe": per_window_sharpe,
        }

    returns = [t["return_pct"] for t in trades]
    wins = [r for r in returns if r > 0]
    losses = [r for r in returns if r <= 0]

    # Compounded equity, starting at 1.0
    total_return = 1.0
    equity = [1.0]
    for r in returns:
        total_return *= (1 + r / 100)
        equity.append(total_return)

    # Max drawdown: most negative relative distance below the running peak
    peak_eq = equity[0]
    max_dd = 0.0
    for eq in equity:
        peak_eq = max(peak_eq, eq)
        max_dd = min(max_dd, (eq - peak_eq) / peak_eq)

    # Sharpe (annualized approximation)
    # NOTE(review): the factor sqrt(252 / number_of_trades) shrinks as the
    # trade count grows; confirm this is the intended annualisation.
    mean_r = np.mean(returns)
    std_r = np.std(returns) if len(returns) > 1 else 1.0
    trades_per_year = 252  # approximate
    sharpe = (mean_r / std_r) * np.sqrt(trades_per_year / max(1, len(returns))) if std_r > 0 else 0

    # Profit factor; with no losing trade the denominator is taken as 1
    gross_profit = sum(wins) if wins else 0
    gross_loss = abs(sum(losses)) if losses else 1
    profit_factor = gross_profit / gross_loss if gross_loss > 0 else gross_profit

    # Feature importances
    fi_avg = fi_sum / max(fi_count, 1)
    fi_sorted = sorted(zip(feature_cols, fi_avg), key=lambda x: -x[1])
    feature_importances = {name: round(float(val), 4) for name, val in fi_sorted[:30]}

    # Monthly returns (approximate by grouping trades)
    trades_per_month = max(1, len(trades) // 12)
    monthly_returns = [
        round(sum(returns[i:i + trades_per_month]), 2)
        for i in range(0, len(returns), trades_per_month)
    ]

    # Sample equity curve to ~100 points
    if len(equity) > 100:
        step = -(-len(equity) // 100)  # ceiling division caps the sample size
        equity_sampled = [round(equity[i], 4) for i in range(0, len(equity), step)]
        if (len(equity) - 1) % step != 0:
            # Keep the end point so the curve agrees with total_return_pct.
            equity_sampled.append(round(equity[-1], 4))
    else:
        equity_sampled = [round(e, 4) for e in equity]

    return {
        "sharpe_ratio": round(float(sharpe), 3),
        "total_return_pct": round((total_return - 1) * 100, 2),
        "max_drawdown_pct": round(max_dd * 100, 2),
        "win_rate": round(len(wins) / len(returns), 3) if returns else 0,
        "trade_count": len(trades),
        "profit_factor": round(profit_factor, 3),
        "avg_trade_duration_candles": round(float(np.mean([t["duration"] for t in trades])), 1),
        "feature_importances": feature_importances,
        "monthly_returns": monthly_returns,
        "equity_curve": equity_sampled,
        "per_window_sharpe": per_window_sharpe,
    }
def main():
    """Command-line entry point for the train-and-backtest script.

    Reads the config JSON and the OHLCV CSV named on the command line,
    computes features and target labels, drops rows with missing values,
    runs the walk-forward train/test routine and writes its result dict to
    the requested output path as JSON.
    """
    cli = argparse.ArgumentParser(description="BTC ML Trading — Train & Backtest")
    for flag, help_text in (
        ("--config", "Path to config JSON"),
        ("--data", "Path to OHLCV CSV"),
        ("--output", "Path to output results JSON"),
    ):
        cli.add_argument(flag, required=True, help=help_text)
    opts = cli.parse_args()

    # Load config
    with open(opts.config) as cfg_file:
        config = json.load(cfg_file)

    # Load data
    print(f"Loading data from {opts.data}...")
    df = pd.read_csv(opts.data, parse_dates=["timestamp"])
    first_ts = df["timestamp"].iloc[0]
    last_ts = df["timestamp"].iloc[-1]
    print(f" {len(df)} rows, {first_ts} → {last_ts}")

    # Compute features
    print("Computing features...")
    df = compute_features(df, config)

    # Create target
    print("Creating target labels...")
    df["target"] = create_target(df, config)

    # Every column that is neither raw OHLCV data nor the label is a feature.
    non_feature = ("timestamp", "open", "high", "low", "close", "volume", "target")
    feature_cols = [col for col in df.columns if col not in non_feature]

    # Drop rows where any feature or the label is missing.
    df = df.dropna(subset=feature_cols + ["target"]).reset_index(drop=True)
    print(f" {len(df)} rows after dropping NaN, {len(feature_cols)} features")
    target_counts = df["target"].value_counts().to_dict()
    print(f" Target distribution: {target_counts}")

    # Run walk-forward training + backtesting
    print("\nRunning walk-forward validation...")
    results = walk_forward_train_test(df, feature_cols, config)

    # Save results
    with open(opts.output, "w") as out_file:
        json.dump(results, out_file, indent=2)
    print(f"\nResults saved to {opts.output}")
    print(f" Sharpe: {results['sharpe_ratio']}, Return: {results['total_return_pct']}%, "
          f"Win Rate: {results['win_rate']}, Trades: {results['trade_count']}")


if __name__ == "__main__":
    main()
import json
import os
import subprocess
import sys
import time
from datetime import datetime, timezone

# Paths
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, "data")
CONFIG_DIR = os.path.join(BASE_DIR, "config")
RESULTS_DIR = os.path.join(BASE_DIR, "results")
ITERATIONS_LOG = os.path.join(RESULTS_DIR, "iterations.jsonl")

# Remote machines
WINDOWS_HOST = "bizzle@100.76.218.38"
WINDOWS_DIR = "~/btc-ml-optimizer"
MAC_MINI_HOST = "bizzle@bizzles-mac-mini-1"

# Convergence
MAX_ITERATIONS = 50
CONVERGENCE_WINDOW = 5
CONVERGENCE_THRESHOLD = 0.01  # 1% improvement
TARGET_SHARPE = 3.0
ML_TIMEOUT = 600  # 10 minutes


# Colors
class C:
    """ANSI escape sequences used to style console output."""

    BOLD = "\033[1m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    RED = "\033[91m"
    CYAN = "\033[96m"
    MAGENTA = "\033[95m"
    DIM = "\033[2m"
    RESET = "\033[0m"


def log(msg, color=""):
    """Print ``msg`` prefixed with a dimmed UTC HH:MM:SS timestamp.

    Args:
        msg: Text to print.
        color: Optional ANSI prefix applied to the message body.
    """
    ts = datetime.now(timezone.utc).strftime("%H:%M:%S")
    print(f"{C.DIM}[{ts}]{C.RESET} {color}{msg}{C.RESET}")


def run_cmd(cmd, timeout=120, check=True, cwd=None):
    """Run a command and return its stripped stdout.

    Args:
        cmd: A shell command string (run through the shell, as before) or a
            list of arguments (run directly, so paths with spaces or shell
            metacharacters need no quoting).
        timeout: Seconds to wait before ``subprocess.TimeoutExpired`` is raised.
        check: When true, a non-zero exit status raises ``RuntimeError``.
        cwd: Optional working directory for the child process.

    Returns:
        The command's standard output with surrounding whitespace removed.

    Raises:
        RuntimeError: The command exited non-zero and ``check`` is true.
        subprocess.TimeoutExpired: The command exceeded ``timeout``.
    """
    use_shell = isinstance(cmd, str)
    result = subprocess.run(
        cmd, shell=use_shell, capture_output=True, text=True,
        timeout=timeout, cwd=cwd,
    )
    if check and result.returncode != 0:
        shown = cmd if use_shell else " ".join(str(part) for part in cmd)
        raise RuntimeError(f"Command failed: {shown}\n{result.stderr}")
    return result.stdout.strip()


def ensure_data():
    """Make sure BTC data is fetched; run the fetch script if a file is missing."""
    data_4h = os.path.join(DATA_DIR, "btc_4h.csv")
    data_1h = os.path.join(DATA_DIR, "btc_1h.csv")
    if os.path.exists(data_4h) and os.path.exists(data_1h):
        log("Data files already exist", C.GREEN)
        return
    log("Fetching BTC data...", C.YELLOW)
    # Argument list + cwd instead of "cd ... &&" so BASE_DIR may contain spaces.
    run_cmd(["python3", "scripts/fetch_data.py"], timeout=300, cwd=BASE_DIR)


def setup_windows_remote():
    """Ensure the remote directory exists on Windows."""
    log("Ensuring Windows remote directory exists...", C.CYAN)
    run_cmd(["ssh", WINDOWS_HOST, f"mkdir -p {WINDOWS_DIR}"], timeout=30)


def scp_to_windows(local_path, remote_name):
    """SCP a local file into the project directory on the Windows PC.

    Args:
        local_path: Path of the file on this machine.
        remote_name: File name to use inside ``WINDOWS_DIR`` on the remote.
    """
    remote = f"{WINDOWS_HOST}:{WINDOWS_DIR}/{remote_name}"
    run_cmd(["scp", "-q", local_path, remote], timeout=60)


def scp_from_windows(remote_name, local_path):
    """SCP a file from the project directory on the Windows PC.

    Args:
        remote_name: File name inside ``WINDOWS_DIR`` on the remote.
        local_path: Destination path on this machine.
    """
    remote = f"{WINDOWS_HOST}:{WINDOWS_DIR}/{remote_name}"
    run_cmd(["scp", "-q", remote, local_path], timeout=60)


def run_ml_training():
    """Run the ML engine on the Windows PC via SSH and echo its output.

    Returns:
        True when the remote script exits with status 0.

    Raises:
        RuntimeError: The remote script exited non-zero; the message carries
            both stderr and stdout.
        subprocess.TimeoutExpired: Training exceeded ``ML_TIMEOUT`` seconds.
    """
    remote_cmd = (
        f"cd {WINDOWS_DIR} && python train_and_backtest.py "
        "--config config.json --data btc_4h.csv --output results.json"
    )
    log("Running ML training on Windows PC (GPU)...", C.MAGENTA)
    result = subprocess.run(
        ["ssh", WINDOWS_HOST, remote_cmd],
        capture_output=True, text=True, timeout=ML_TIMEOUT,
    )
    if result.returncode != 0:
        raise RuntimeError(f"ML training failed:\n{result.stderr}\n{result.stdout}")
    # Print training output
    for line in result.stdout.splitlines():
        log(f" {C.DIM}{line}", C.DIM)
    return True


def load_iteration_history(path=None):
    """Load iteration history from a JSONL log.

    Blank lines are ignored. A line that is not valid JSON (for example a
    record truncated by an interrupted write) is skipped with a warning so
    that a previous run can still be resumed.

    Args:
        path: Log file to read; defaults to ``ITERATIONS_LOG``.

    Returns:
        The decoded records in file order; empty if the file does not exist.
    """
    log_path = ITERATIONS_LOG if path is None else path
    history = []
    if not os.path.exists(log_path):
        return history
    with open(log_path, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                history.append(json.loads(line))
            except json.JSONDecodeError as exc:
                log(f"Skipping malformed line {line_no} in {log_path}: {exc}", C.YELLOW)
    return history


def save_iteration(iteration_data, path=None):
    """Append one iteration record to a JSONL log.

    Args:
        iteration_data: JSON-serializable record to append as a single line.
        path: Log file to append to; defaults to ``ITERATIONS_LOG``. The
            parent directory is created when missing.
    """
    log_path = ITERATIONS_LOG if path is None else path
    os.makedirs(os.path.dirname(log_path) or ".", exist_ok=True)
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(iteration_data) + "\n")
def check_convergence(history):
    """Check if optimization has converged.

    The target-Sharpe test runs on any non-empty history, so a run that hits
    the target early stops immediately. The stall test needs at least
    ``CONVERGENCE_WINDOW + 1`` recorded iterations.

    Args:
        history: Iteration records; each must carry a numeric "sharpe".

    Returns:
        A ``(converged, reason)`` tuple.
    """
    if not history:
        return False, "Not enough iterations"

    # Check if best sharpe exceeds target
    best_sharpe = max(h["sharpe"] for h in history)
    if best_sharpe >= TARGET_SHARPE:
        return True, f"Target Sharpe reached: {best_sharpe:.3f}"

    if len(history) < CONVERGENCE_WINDOW + 1:
        return False, "Not enough iterations"

    # Check if improvement has stalled: the relative spread of the most
    # recent Sharpe values is below the threshold.
    sharpes = [h["sharpe"] for h in history[-CONVERGENCE_WINDOW:]]
    best_recent = max(sharpes)
    worst_recent = min(sharpes)
    if best_recent > 0 and (best_recent - worst_recent) / best_recent < CONVERGENCE_THRESHOLD:
        return True, f"Converged: Sharpe variance < {CONVERGENCE_THRESHOLD*100}% over {CONVERGENCE_WINDOW} iterations"

    return False, ""


def _print_box(lines, color, width=50):
    """Print ``lines`` centered in a box-drawing frame, wrapped in ``color``."""
    top = "╔" + "═" * width + "╗"
    bottom = "╚" + "═" * width + "╝"
    body = "\n".join(f"║{text:^{width}}║" for text in lines)
    print(f"\n{color}{top}\n{body}\n{bottom}{C.RESET}\n")


def print_header():
    """Print the startup banner."""
    _print_box(
        [
            "BTC ML Trading Strategy Optimizer",
            "VPS → Windows GPU → Mac Mini LLM → Loop",
        ],
        f"{C.BOLD}{C.CYAN}",
    )


def print_results(results, iteration):
    """Print a summary of one iteration's backtest metrics.

    Args:
        results: Result dict from the ML engine; missing keys print as 0
            (or an empty list for the per-window Sharpes).
        iteration: Iteration number shown in the heading.
    """
    sharpe = results.get("sharpe_ratio", 0)
    # Green above 1.5, yellow above 1.0, red otherwise.
    sharpe_color = C.GREEN if sharpe > 1.5 else C.YELLOW if sharpe > 1.0 else C.RED
    print(f"""
{C.BOLD}━━━ Iteration {iteration} Results ━━━{C.RESET}
  Sharpe Ratio: {sharpe_color}{C.BOLD}{sharpe:.3f}{C.RESET}
  Total Return: {results.get('total_return_pct', 0):.1f}%
  Max Drawdown: {results.get('max_drawdown_pct', 0):.1f}%
  Win Rate: {results.get('win_rate', 0):.1%}
  Trade Count: {results.get('trade_count', 0)}
  Profit Factor: {results.get('profit_factor', 0):.3f}
  Avg Duration: {results.get('avg_trade_duration_candles', 0):.1f} candles
  Window Sharpes: {results.get('per_window_sharpe', [])}
""")


def main():
    """Run the optimization loop.

    Each iteration uploads the current config, trains and backtests on the
    Windows PC, records the metrics, and asks the LLM analyzer for the next
    config. It stops on convergence or after ``MAX_ITERATIONS``. A failed
    training run or results fetch is logged and the loop continues with the
    last recorded config.
    """
    import copy
    import random

    print_header()
    os.makedirs(RESULTS_DIR, exist_ok=True)

    # Step 1: Ensure data
    ensure_data()

    # Step 2: Load or create initial config
    config_path = os.path.join(CONFIG_DIR, "initial_config.json")
    best_config_path = os.path.join(CONFIG_DIR, "best_config.json")

    # Resume from best config if it exists
    if os.path.exists(best_config_path):
        log("Resuming from best_config.json", C.GREEN)
        with open(best_config_path) as f:
            config = json.load(f)
    else:
        with open(config_path) as f:
            config = json.load(f)

    history = load_iteration_history()
    start_iter = len(history) + 1
    best_sharpe = max((h["sharpe"] for h in history), default=0)

    log(f"Starting at iteration {start_iter}, best Sharpe so far: {best_sharpe:.3f}", C.BOLD)

    # Step 3: Setup Windows remote
    setup_windows_remote()

    # SCP the ML engine script (once)
    log("Uploading ML engine to Windows...", C.CYAN)
    scp_to_windows(os.path.join(BASE_DIR, "ml_engine", "train_and_backtest.py"), "train_and_backtest.py")

    # SCP data files (once)
    for tf in ["1h", "4h"]:
        data_file = os.path.join(DATA_DIR, f"btc_{tf}.csv")
        if os.path.exists(data_file):
            log(f"Uploading btc_{tf}.csv to Windows...", C.CYAN)
            scp_to_windows(data_file, f"btc_{tf}.csv")

    # Import LLM analyzer
    sys.path.insert(0, os.path.join(BASE_DIR, "llm_client"))
    from analyzer import analyze_and_suggest

    # Main optimization loop
    for iteration in range(start_iter, MAX_ITERATIONS + 1):
        log(f"\n{'='*50}", C.BOLD)
        log(f"ITERATION {iteration}/{MAX_ITERATIONS}", f"{C.BOLD}{C.CYAN}")
        log(f"Model: {config.get('model_type', 'unknown')}, "
            f"LR: {config.get('hyperparameters', {}).get('learning_rate', '?')}, "
            f"Depth: {config.get('hyperparameters', {}).get('max_depth', '?')}", C.DIM)
        log(f"{'='*50}", C.BOLD)

        # Write current config to temp file and SCP
        tmp_config = os.path.join(BASE_DIR, "config", "current_config.json")
        with open(tmp_config, "w") as f:
            json.dump(config, f, indent=2)
        scp_to_windows(tmp_config, "config.json")

        # Run ML training on Windows and fetch its results. A failed fetch or
        # an unreadable results file is handled like a failed training run
        # instead of aborting the whole optimization.
        results_local = os.path.join(RESULTS_DIR, f"results_iter_{iteration}.json")
        try:
            run_ml_training()
            scp_from_windows("results.json", results_local)
            with open(results_local) as f:
                results = json.load(f)
        except (RuntimeError, subprocess.TimeoutExpired, OSError, ValueError) as e:
            log(f"ML training failed: {e}", C.RED)
            log("Reverting to previous config and continuing...", C.YELLOW)
            if history:
                # Copy so later edits cannot alter the recorded history entry.
                config = copy.deepcopy(history[-1].get("config", config))
            continue

        print_results(results, iteration)

        # Track best
        current_sharpe = results.get("sharpe_ratio", 0)
        is_best = current_sharpe > best_sharpe

        if is_best:
            best_sharpe = current_sharpe
            with open(best_config_path, "w") as f:
                json.dump(config, f, indent=2)
            log(f"NEW BEST! Sharpe: {best_sharpe:.3f}", f"{C.BOLD}{C.GREEN}")

        # Log iteration. The config is snapshotted: the fallback perturbation
        # below changes hyperparameters, and the history entry must keep the
        # values that actually produced these results.
        iter_data = {
            "iteration": iteration,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "sharpe": current_sharpe,
            "return": results.get("total_return_pct", 0),
            "max_drawdown": results.get("max_drawdown_pct", 0),
            "win_rate": results.get("win_rate", 0),
            "trades": results.get("trade_count", 0),
            "profit_factor": results.get("profit_factor", 0),
            "model_type": config.get("model_type", "unknown"),
            "is_best": is_best,
            "config": copy.deepcopy(config),
        }
        save_iteration(iter_data)
        history.append(iter_data)

        # Check convergence
        converged, reason = check_convergence(history)
        if converged:
            log(f"\nOptimization converged: {reason}", f"{C.BOLD}{C.GREEN}")
            break

        if iteration >= MAX_ITERATIONS:
            log(f"\nMax iterations ({MAX_ITERATIONS}) reached.", C.YELLOW)
            break

        # Ask LLM for next config
        log("\nConsulting LLM for strategy modifications...", C.MAGENTA)
        try:
            summary_history = [
                {
                    "iteration": h["iteration"],
                    "sharpe": h["sharpe"],
                    "return": h["return"],
                    "win_rate": h["win_rate"],
                    "trades": h["trades"],
                    "model_type": h["model_type"],
                }
                for h in history
            ]
            new_config, reasoning = analyze_and_suggest(config, results, summary_history)
            # str() so a non-string reasoning value cannot turn a successful
            # LLM call into the failure path.
            log(f" LLM reasoning: {str(reasoning)[:200]}...", C.DIM)
            config = new_config
        except Exception as e:
            log(f"LLM call failed: {e}", C.RED)
            log("Continuing with current config + random perturbation...", C.YELLOW)
            # Small random perturbation as fallback, applied to a copy.
            config = copy.deepcopy(config)
            hp = config.setdefault("hyperparameters", {})
            hp["learning_rate"] = hp.get("learning_rate", 0.05) * random.uniform(0.8, 1.2)
            hp["max_depth"] = max(3, min(10, hp.get("max_depth", 6) + random.choice([-1, 0, 1])))

    # Final summary
    _print_box(["Optimization Complete!"], f"{C.BOLD}{C.GREEN}")
    print(f"""  Total Iterations: {len(history)}
  Best Sharpe: {C.BOLD}{best_sharpe:.3f}{C.RESET}
  Best Config: {best_config_path}
  Iteration Log: {ITERATIONS_LOG}
""")


if __name__ == "__main__":
    main()
def fetch_ohlcv(timeframe: str) -> pd.DataFrame:
    """Fetch OHLCV data for a given timeframe.

    Pages through the exchange from ``YEARS_HISTORY`` years ago until now,
    ``LIMIT_PER_REQUEST`` candles per request.

    Args:
        timeframe: "1h" or "4h".

    Returns:
        A DataFrame with columns timestamp (UTC), open, high, low, close and
        volume, de-duplicated and sorted by timestamp.

    Raises:
        ValueError: ``timeframe`` is not supported.
        RuntimeError: Fetching failed several times in a row.
    """
    candle_ms = {"1h": 3600 * 1000, "4h": 4 * 3600 * 1000}
    if timeframe not in candle_ms:
        raise ValueError(f"Unsupported timeframe: {timeframe}")
    ms_per_candle = candle_ms[timeframe]

    exchange = ccxt.binance({"enableRateLimit": True})

    # Calculate start time
    now_ms = int(time.time() * 1000)
    since = now_ms - (YEARS_HISTORY * 365 * 24 * 3600 * 1000)

    all_candles = []
    start_label = datetime.fromtimestamp(since / 1000, tz=timezone.utc).strftime('%Y-%m-%d')
    print(f" Fetching {SYMBOL} {timeframe} from {start_label}...")

    # Bound the retries so a persistent error (bad symbol, no network)
    # fails the script instead of looping forever.
    max_consecutive_errors = 5
    consecutive_errors = 0

    while since < now_ms:
        try:
            candles = exchange.fetch_ohlcv(SYMBOL, timeframe, since=since, limit=LIMIT_PER_REQUEST)
        except Exception as e:
            consecutive_errors += 1
            if consecutive_errors >= max_consecutive_errors:
                raise RuntimeError(
                    f"Giving up on {SYMBOL} {timeframe} after "
                    f"{consecutive_errors} consecutive fetch errors"
                ) from e
            print(f" Warning: fetch error, retrying in 5s — {e}")
            time.sleep(5)
            continue
        consecutive_errors = 0

        if not candles:
            break

        all_candles.extend(candles)
        next_since = candles[-1][0] + ms_per_candle
        if next_since <= since:
            # The cursor did not advance; stop instead of re-requesting the
            # same page forever.
            break
        since = next_since
        sys.stdout.write(f"\r Downloaded {len(all_candles)} candles...")
        sys.stdout.flush()
        time.sleep(exchange.rateLimit / 1000)

    print(f"\r Downloaded {len(all_candles)} candles total.")

    df = pd.DataFrame(all_candles, columns=["timestamp", "open", "high", "low", "close", "volume"])
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms", utc=True)
    df = df.drop_duplicates(subset=["timestamp"]).sort_values("timestamp").reset_index(drop=True)
    return df


def main():
    """Fetch the 1h and 4h datasets and save each as ``btc_<tf>.csv`` in DATA_DIR.

    Raises:
        RuntimeError: A timeframe returned no candles. Nothing is written in
            that case, so an empty CSV is never left behind.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    for tf in ["1h", "4h"]:
        print(f"\n[*] Fetching {tf} data...")
        df = fetch_ohlcv(tf)
        if df.empty:
            raise RuntimeError(f"No {tf} candles were returned for {SYMBOL}; nothing saved")
        out_path = os.path.join(DATA_DIR, f"btc_{tf}.csv")
        df.to_csv(out_path, index=False)
        print(f" Saved {len(df)} rows to {out_path}")
        print(f" Range: {df['timestamp'].iloc[0]} → {df['timestamp'].iloc[-1]}")

    print("\nData fetch complete!")


if __name__ == "__main__":
    main()
# Install through "python -m pip" so the packages land in the interpreter that
# the verification step below runs as "python". On Windows a bare
# "pip install --upgrade pip" cannot replace the pip executable while it runs.
ssh "$WINDOWS_HOST" "python -m pip install --upgrade pip && \
    python -m pip install torch --index-url https://download.pytorch.org/whl/cu128 && \
    python -m pip install xgboost lightgbm catboost optuna && \
    python -m pip install pandas numpy scikit-learn ta"

# Verify installations: import every dependency on the remote and print its
# version. A failed import makes ssh exit non-zero, which aborts this script
# when it runs under "set -e".
echo "[3/3] Verifying installations..."
ssh "$WINDOWS_HOST" "python -c \"
import torch
print(f'PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'  GPU: {torch.cuda.get_device_name(0)}')
import xgboost; print(f'XGBoost {xgboost.__version__}')
import lightgbm; print(f'LightGBM {lightgbm.__version__}')
import catboost; print(f'CatBoost {catboost.__version__}')
import optuna; print(f'Optuna {optuna.__version__}')
import pandas; print(f'Pandas {pandas.__version__}')
import numpy; print(f'NumPy {numpy.__version__}')
import ta; print('ta library OK')
import sklearn; print(f'scikit-learn {sklearn.__version__}')
print('All dependencies verified!')
\""

echo ""
echo "=== Setup complete! ==="