# Pick a sports frame, send it to NVIDIA Cosmos Reason 2, and watch chain-of-thought physical reasoning stream back in real time.
# Fetch the 8B checkpoint from the Hugging Face Hub (~16GB on disk).
huggingface-cli download nvidia/Cosmos-Reason2-8B --local-dir ./models/cosmos-reason2-8b
# Start the bundled inference server against the locally downloaded checkpoint.
# NOTE(review): cosmos_server.py is a project-local script not shown here —
# assumes it accepts --model-path and binds port 8000; confirm against its
# argument parser before changing this invocation.
python cosmos_server.py --model-path ./models/cosmos-reason2-8b
# Serves at http://localhost:8000/v1
# Option A: serve the full-precision 8B model with vLLM
# (exposes an OpenAI-compatible API on port 8000).
pip install vllm
vllm serve nvidia/Cosmos-Reason2-8B \
  --port 8000 \
  --max-model-len 16384 \
  --reasoning-parser qwen3 \
  --allowed-local-media-path "$(pwd)"
# Option B: INT4-quantized 2B edge build — fits in 8GB of VRAM.
pip install vllm
vllm serve embedl/Cosmos-Reason2-2B-W4A16-Edge2 \
  --port 8000 \
  --max-model-len 8192 \
  --gpu-memory-utilization 0.75 \
  --allowed-local-media-path "$(pwd)"
# Option C: run the GGUF build through llama.cpp.
brew install llama.cpp
# Grab the quantized weights plus the multimodal projector file.
huggingface-cli download prithivMLmods/Cosmos-Reason2-8B-GGUF \
  Cosmos-Reason2-8B.Q8_0.gguf Cosmos-Reason2-8B.mmproj-q8_0.gguf \
  --local-dir ./models
# Note: the GGUF path uses llama-mtmd-cli directly rather than an
# OpenAI-compatible endpoint; for server-based options, run demo.py
# with --endpoint instead.