Cost Estimator ↗
noOriginal Documentation
Documentation Index#
Fetch the complete documentation index at: https://docs.fireworks.ai/llms.txt — use this file to discover all available pages before exploring further.
Estimate and optimize the cost of your RFT training jobs
export const RftCostCalculator = () => { const MODEL_GPU_CONFIG = { “accounts/fireworks/models/kimi-k2-instruct-0905”: { gpus: 8, gpuType: “NVIDIA_B200_180GB” }, “accounts/fireworks/models/kimi-k2p5”: { gpus: 8, gpuType: “NVIDIA_B200_180GB” }, “accounts/fireworks/models/deepseek-v2-lite-chat”: { gpus: 8, gpuType: “NVIDIA_H200_141GB” }, “accounts/fireworks/models/deepseek-r1-0528”: { gpus: 8, gpuType: “NVIDIA_H200_141GB” }, “accounts/fireworks/models/deepseek-r1-distill-qwen-14b”: { gpus: 8, gpuType: “NVIDIA_H200_141GB” } }; const GPU_PRICING = { “NVIDIA_A100_80GB”: 2.9, “NVIDIA_H100_80GB”: 4.0, “NVIDIA_H200_141GB”: 6.0, “NVIDIA_B200_180GB”: 9.0 }; const COMMON_MODELS = [“accounts/fireworks/models/qwen3-0p6b”, “accounts/fireworks/models/qwen2p5-coder-14b-instruct”, “accounts/fireworks/models/kimi-k2-instruct-0905”, “accounts/fireworks/models/kimi-k2p5”, “accounts/fireworks/models/qwen3-235b-a22b-instruct-2507”, “accounts/fireworks/models/qwen3-coder-30b-a3b-instruct”, “accounts/fireworks/models/qwen3-32b”, “accounts/fireworks/models/gpt-oss-20b”, “accounts/fireworks/models/llama-v3p1-8b-instruct”, “accounts/fireworks/models/gpt-oss-120b”, “accounts/fireworks/models/qwen3-8b”, “accounts/fireworks/models/deepseek-v2-lite-chat”, “accounts/fireworks/models/deepseek-r1-0528”, “accounts/fireworks/models/qwen3p5-9b”, “accounts/fireworks/models/qwen3p5-27b”, “accounts/fireworks/models/qwen3p5-35b-a3b”, “accounts/fireworks/models/qwen3p5-122b-a10b”]; const getGpuConfig = modelName => { return MODEL_GPU_CONFIG[modelName] || ({ gpus: 4, gpuType: “NVIDIA_H200_141GB” }); }; const parseParameterCount = modelName => { const lower = modelName.toLowerCase(); if (lower.includes(“k2”) || lower.includes(“k2p5”)) { return 1024_000_000_000; } if (lower.includes(“deepseek-r1-0528”)) { return 671_000_000_000; } if (lower.includes(“deepseek-v3”)) { return 671_000_000_000; } if (lower.includes(“deepseek-v2-lite”)) { return 15_700_000_000; } if (lower.includes(“deepseek-v2”)) { return 
236_000_000_000; } const decimalMatch = modelName.match(/-(\d+)p(\d+)([bm])(?:-|$)/i); if (decimalMatch) { const whole = parseInt(decimalMatch[1], 10); const decimal = parseInt(decimalMatch[2], 10); const unit = decimalMatch[3].toLowerCase(); const value = whole + decimal / Math.pow(10, decimalMatch[2].length); if (unit === “b”) { return value * 1_000_000_000; } if (unit === “m”) { return value * 1_000_000; } } const integerMatch = modelName.match(/-(\d+)([bm])(?:-|$)/i); if (integerMatch) { const whole = parseInt(integerMatch[1], 10); const unit = integerMatch[2].toLowerCase(); if (unit === “b”) { return whole * 1_000_000_000; } if (unit === “m”) { return whole * 1_000_000; } } return 0; }; const models = COMMON_MODELS.map(modelName => { const gpuConfig = getGpuConfig(modelName); const params = parseParameterCount(modelName); const free = params > 0 && params < 16_000_000_000; return { baseModel: modelName, gpus: gpuConfig.gpus, gpuType: gpuConfig.gpuType, free }; }); const [modelIdx, setModelIdx] = useState(0); const [prompts, setPrompts] = useState(500); const [epochs, setEpochs] = useState(1); const [customEpochs, setCustomEpochs] = useState(""); const [isCustomEpochs, setIsCustomEpochs] = useState(false); const [rollouts, setRollouts] = useState(4); const [customRollouts, setCustomRollouts] = useState(""); const [isCustomRollouts, setIsCustomRollouts] = useState(false); const [maxTokens, setMaxTokens] = useState(2048); const [customMaxTokens, setCustomMaxTokens] = useState(""); const [isCustomMaxTokens, setIsCustomMaxTokens] = useState(false); const model = models[modelIdx]; const gpuRate = GPU_PRICING[model.gpuType] || 6.0; const effectiveEpochs = isCustomEpochs && customEpochs ? Number(customEpochs) || 1 : epochs; const effectiveRollouts = isCustomRollouts && customRollouts ? Number(customRollouts) || 4 : rollouts; const effectiveMaxTokens = isCustomMaxTokens && customMaxTokens ? 
Number(customMaxTokens) || 2048 : maxTokens; const avgTokens = Math.round(effectiveMaxTokens * 0.6); const totalRolloutTokens = prompts * effectiveEpochs * effectiveRollouts * avgTokens; const totalTokensMillion = totalRolloutTokens / 1_000_000; const getEfficiencyRangeByModelSize = params => { if (params === 0) { return { min: 1.0, max: 8.0 }; } if (params < 16_000_000_000) { return { min: 0.4, max: 6.0 }; } if (params < 70_000_000_000) { return { min: 0.3, max: 6.4 }; } if (params < 300_000_000_000) { return { min: 0.3, max: 14.1 }; } return { min: 4.0, max: 15.0 }; }; const modelParams = parseParameterCount(model.baseModel); const efficiencyRange = getEfficiencyRangeByModelSize(modelParams); const gpuHoursPerMtokMin = efficiencyRange.min; const gpuHoursPerMtokMax = efficiencyRange.max; const totalGpuHoursMin = totalTokensMillion * gpuHoursPerMtokMin; const totalGpuHoursMax = totalTokensMillion * gpuHoursPerMtokMax; const costMin = model.free ? 0 : totalGpuHoursMin * gpuRate; const costMax = model.free ? 0 : totalGpuHoursMax * gpuRate; const totalRolloutsCount = prompts * effectiveEpochs * effectiveRollouts; const inputClasses = “w-full px-3 py-2 rounded-lg text-sm border border-zinc-300 dark:border-zinc-500 bg-white dark:bg-zinc-800 text-zinc-900 dark:text-zinc-100 outline-none focus:ring-2 focus:ring-purple-500/40 dark:focus:ring-purple-500/60 focus:border-purple-500/50 dark:focus:border-purple-400 transition-colors”; const labelClasses = “block text-xs font-medium mb-1 text-zinc-500 dark:text-zinc-400”; return {}
{}
<label>Base Model</label>
<select value={modelIdx} onChange={e => setModelIdx(Number(e.target.value))}>
{models.map((m, i) => <option key={i} value={i}>
{m.baseModel}
</option>)}
</select>
{}
<label>Dataset prompts</label>
<input type="number" min={1} max={100000} value={prompts} onChange={e => setPrompts(Math.max(1, Number(e.target.value) || 1))} />
{}
<label>Epochs</label>
<select value={isCustomEpochs ? "other" : epochs} onChange={e => {
if (e.target.value === “other”) { setIsCustomEpochs(true); } else { setIsCustomEpochs(false); setEpochs(Number(e.target.value)); } }}> {[1, 2, 3, 4, 5].map(n => {isCustomEpochs && <input type=“number” min={1} value={customEpochs} onChange={e => setCustomEpochs(e.target.value)} placeholder=“Enter number of epochs” mt-2`} />}
{}
<label>Response candidates count (n)</label>
<select value={isCustomRollouts ? "other" : rollouts} onChange={e => {
if (e.target.value === “other”) { setIsCustomRollouts(true); } else { setIsCustomRollouts(false); setRollouts(Number(e.target.value)); } }}> {[2, 4, 6, 8].map(n => {isCustomRollouts && <input type=“number” min={1} value={customRollouts} onChange={e => setCustomRollouts(e.target.value)} placeholder=“Enter number of candidates” mt-2`} />}
{}
<label>Max tokens per rollout</label>
<select value={isCustomMaxTokens ? "other" : maxTokens} onChange={e => {
if (e.target.value === “other”) { setIsCustomMaxTokens(true); } else { setIsCustomMaxTokens(false); setMaxTokens(Number(e.target.value)); } }}> {[256, 512, 1024, 2048, 4096, 8192, 16384].map(n => {isCustomMaxTokens && <input type=“number” min={1} value={customMaxTokens} onChange={e => setCustomMaxTokens(e.target.value)} placeholder=“Enter max tokens” mt-2`} />}
{}
{}
Total Rollouts
{totalRolloutsCount.toLocaleString()}
{prompts.toLocaleString()} × {effectiveEpochs} × {effectiveRollouts}
{}
GPUs
{model.gpus}
{model.gpuType.replace("NVIDIA_", "").split("_")[0]}
{}
GPU Hours Range
{totalGpuHoursMin.toFixed(1)} - {totalGpuHoursMax.toFixed(1)}
{totalTokensMillion.toFixed(2)}M tokens × {gpuHoursPerMtokMin}-{gpuHoursPerMtokMax} GPU hrs/Mtok
{}
Estimated Cost Range
{model.free ? "Free" : `$${costMin.toFixed(2)} - $${costMax.toFixed(2)}`}
{model.free ? "Models under 16B" : `$${gpuRate}/GPU-hour`}
; };
Reinforcement Fine-Tuning (RFT) is free for models under 16B parameters. When creating an RFT job in the UI, filter for free tuning models in the model selection area on the fine-tuning creation page. If kicking off jobs from the terminal, you can find the model ID from the Model Library. Note: SFT and DPO jobs are billed per training token for all model sizes—see the pricing page for details.
## Interactive cost calculator
Select your model and training configuration to get an instant cost estimate. The calculator uses the following formulas:
- Total tokens: Prompts × Epochs × Response candidates × (Max tokens × 0.6), where the 0.6 factor assumes the average response uses about 60% of the max-token limit
- GPU hours: (Total tokens ÷ 1M) × (GPU hours per million tokens range, varies by model size)
- Cost: GPU hours × GPU rate per hour
You can derive wall-clock training time from the estimate as: Training time = GPU hours ÷ Number of GPUs.
The GPU hours per million tokens range varies by model size and accounts for variability in model efficiency, system overhead, and actual response lengths. Ranges are based on actual RFT job data.
Order-of-magnitude estimates only. This calculator provides estimates and is not intended for real forecasting or budgeting. Actual costs may vary significantly.
## How RFT pricing works
Reinforcement fine-tuning jobs are billed based on GPU-seconds consumed during training. The total cost depends on three main factors:
- Model size — Determines how many GPUs are needed and the per-GPU-hour rate
- Training dataset — How much data is processed (dataset size × epochs × rollouts)
- Rollout generation — Token generation during training (max tokens × rollouts per prompt)
### Cost formula
The approximate cost of an RFT job can be estimated as:
$$ \text{Cost} = \text{GPU-hours} \times \text{Price per GPU-hour} $$
Where GPU-hours depend on:
$$ \text{GPU-hours} \approx \text{Num GPUs} \times \left(\frac{\text{Prompts} \times \text{Epochs} \times \text{Rollouts (n)} \times \text{Avg tokens per rollout}}{\text{Throughput (tokens/sec)}}\right) \div 3600 $$
The key variables are:
| Variable | Description | How to control |
|---|---|---|
| Num GPUs | GPUs required for the model | Determined by model size |
| Prompts | Number of rows in your dataset | Your dataset size |
| Epochs | Passes through the dataset | --epochs flag (default: 1) |
| Response candidates (n) | Responses generated per prompt | --n flag (default: 4) |
| Avg tokens per rollout | Average response length | --max-tokens flag (default: 2048) |
| Throughput | Tokens generated per second | Determined by model + hardware |
Training time directly translates to cost: Cost = Training time × Num GPUs × GPU-hour rate. Check the pricing page for current GPU-hour rates.
## How parameters affect cost
See how each parameter change impacts your total cost relative to a baseline configuration (500 prompts, 1 epoch, n=4, 2048 max tokens):
| Change | Cost impact | Explanation |
|---|---|---|
| Double dataset size (1000 prompts) | ~2× | Linear scaling with dataset size |
| Double rollouts (n=8) | ~2× | Linear scaling with rollout count |
| Double max tokens (4096) | ~1.5–2× | More tokens per rollout |
| Add epoch (epochs=2) | ~2× | Full additional pass through data |
| Double LoRA rank (16 → 32) | ~1.2–1.5× | More trainable parameters |
| Halve max tokens (1024) | ~0.5–0.7× | Fewer tokens generated |
| Halve rollouts (n=2) | ~0.5× | Fewer rollouts but less learning signal |
## Cost optimization tips
Start with a model under 16B parameters, which is free to tune with RFT. This lets you:
- Validate your evaluator logic at zero cost
- Test dataset quality and format
- Tune rollout parameters
- Establish baseline reward curves
Set --max-tokens to the minimum needed for your task:
- Short outputs (classification, short answers): 256–512 tokens
- Medium outputs (code generation, summaries): 1024–2048 tokens
- Long outputs (detailed analysis, multi-step reasoning): 4096+ tokens
Every token generated during rollouts costs compute. Don’t use 16384 max tokens if your task only needs 512.
- Keep evaluations under 5 seconds per rollout
- Cache expensive computations
- For remote evaluators, ensure your server can handle concurrent requests
- Avoid unnecessary API calls in your evaluation logic
Evaluator complexity impact: Simple evaluators (self-contained) have minimal overhead. Evaluators with calls to external services, such as LLM-as-judge use cases or company-specific endpoints, may have variable training time due to rate limits by model providers or other services.
- Remove duplicate or near-duplicate prompts
- Ensure prompts are diverse and representative
- Start with 200–500 well-chosen prompts
- Quality over quantity reduces cost while maintaining performance
## Example cost scenarios
| Parameter | Value |
|---|---|
| Model | Qwen3 0.6B |
| Dataset | 100 prompts |
| Epochs | 1 |
| Rollouts (n) | 4 |
| Max tokens | 2048 |
| Estimated cost | Free |
| Estimated time | ~15–30 minutes |
Best for: Initial evaluator development and testing.
| Parameter | Value |
|---|---|
| Model | Llama 3.1 8B Instruct |
| Dataset | 500 prompts |
| Epochs | 1 |
| Rollouts (n) | 4 |
| Max tokens | 2048 |
| Estimated cost | Free |
| Estimated time | ~1–2 hours |
Best for: Production workloads that can use an 8B model.
| Parameter | Value |
|---|---|
| Model | Llama 3.3 70B Instruct |
| Dataset | 500 prompts |
| Epochs | 1 |
| Rollouts (n) | 4 |
| Max tokens | 2048 |
| Estimated cost | Training hours × 8 GPUs × rate |
| Estimated time | ~1–2 hours |
Check the Fireworks Pricing page for the current GPU-hour rate. For a 2-hour job on 8 GPUs, multiply: 2 × 8 × (rate per GPU-hour).
| Parameter | Value |
|---|---|
| Model | DeepSeek V3 |
| Dataset | 1000 prompts |
| Epochs | 2 |
| Rollouts (n) | 8 |
| Max tokens | 4096 |
| Estimated cost | Training hours × 8 GPUs × rate |
| Estimated time | ~8–16 hours |
This is a larger job. The cost scales with training time: more prompts, epochs, rollouts, and tokens all increase total GPU-hours.
## Monitoring costs during training
The final cost is only available after your job completes, but you can track progress and stop early while it runs:
- Dashboard: The Fireworks Dashboard displays the final cost on the RFT job page once training finishes
- Training progress: While the job is running, you can monitor elapsed time and estimated completion in the job overview
- Early stopping: You can cancel a job early if needed—the model checkpoint from the last completed step is still usable. The final cost will be calculated based on GPU-seconds consumed up to the cancellation point.
If a job is running longer than expected, check your evaluator performance. Slow evaluators are the most common cause of unexpectedly long (and expensive) training runs.
## Next steps
View current GPU-hour rates and pricing tiers
Learn how each parameter affects training quality and cost
Create your first RFT job