{ "cells": [ { "cell_type": "markdown", "id": "license-header", "metadata": {}, "source": [ "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Cosmos3 Generator Audiovisual with Cosmos Framework\n", "\n", "This notebook runs Cosmos3 audiovisual generation through the native Cosmos Framework PyTorch entrypoint:\n", "\n", "```bash\n", "python -m cosmos_framework.scripts.inference\n", "```\n", "\n", "Run all Cosmos3-Nano examples first, then run the Cosmos3-Super T2V/I2V examples without audio. Each section uses the matching checkpoint explicitly.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Prerequisites\n", "\n", "Use a Linux machine with NVIDIA GPU access, model access on Hugging Face, and either `uvx hf@latest auth login` or `HF_TOKEN` set. Use cache/output paths with enough disk space.\n", "\n", "> **Headless servers:** if you see an error like `libxcb.so.1: cannot open shared object file` (a missing system graphics library) when importing or running the model, install the required system libraries:\n", ">\n", "> ```bash\n", "> apt-get install -y libxcb1 libgl1 libglib2.0-0\n", "> ```\n", "\n", "> **uv version:** this notebook needs `uv >= 0.11.3` (enforced by the framework's `pyproject.toml`). Older versions fail to parse the project config (e.g. the `[tool.uv.audit]` section) and do not recognize newer `--torch-backend` values such as `cu130`. If you hit version-related errors, upgrade with `uv self update` (or reinstall from https://astral.sh/uv).\n", "" ], "id": "23c5bfd9" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Configure Paths and Environment\n", "\n", "The defaults are relative to this `cosmos` checkout and use the CUDA 13 or 12.8 dependency group depending on the CUDA version installed on your system (`cu130-train` or `cu128-train`):\n", "\n", "```bash\n", "export COSMOS3_REPO=/path/to/cosmos-framework\n", "export COSMOS3_UV_GROUP=cu130-train\n", "export UV_PROJECT_ENVIRONMENT=/path/to/large/uv/venvs/cosmos3-audiovisual\n", "export COSMOS3_NUM_GPUS=4\n", "export HF_HOME=/path/to/large/huggingface/cache\n", "export CUDA_VISIBLE_DEVICES=0,1,2,3\n", "```\n" ], "id": "a911caf3" }, { "cell_type": "code", "metadata": {}, "source": [ "from pathlib import Path\n", "import os\n", "import socket\n", "\n", "\n", "def find_repo_root(start: Path) -> Path:\n", " for path in [start, *start.parents]:\n", " if (path / \"README.md\").exists() and (path / \"cookbooks\").exists():\n", " return path\n", " return start\n", "\n", "\n", "def free_local_port() -> str:\n", " with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:\n", " sock.bind((\"127.0.0.1\", 0))\n", " return str(sock.getsockname()[1])\n", "\n", "\n", "COSMOS_ROOT = find_repo_root(Path.cwd().resolve())\n", "def default_framework_repo(root: Path) -> Path:\n", " candidates = [\n", " root / \"packages\" / \"cosmos-framework\",\n", " root / \"packages\" / \"cosmos3\",\n", " ]\n", " for candidate in candidates:\n", " if (candidate / \"pyproject.toml\").exists() and (candidate / \"cosmos_framework\").exists():\n", " return candidate\n", " return candidates[0]\n", "\n", "\n", "COSMOS3_REPO = Path(os.environ.get(\"COSMOS3_REPO\", default_framework_repo(COSMOS_ROOT))).resolve()\n", "COSMOS3_GIT_URL = os.environ.get(\"COSMOS3_GIT_URL\", \"git@github.com:NVIDIA/cosmos-framework.git\")\n", "COSMOS3_UV_GROUP = os.environ.get(\"COSMOS3_UV_GROUP\", \"cu130-train\")\n", "COSMOS3_UV_ENV = Path(os.environ.get(\"UV_PROJECT_ENVIRONMENT\", COSMOS3_REPO / \".venv\")).resolve()\n", "COSMOS3_NUM_GPUS = os.environ.get(\"COSMOS3_NUM_GPUS\", \"4\")\n", "COSMOS3_AUDIOVISUAL_ROOT = COSMOS_ROOT / \"cookbooks\" / \"cosmos3\" / \"generator\" / \"audiovisual\"\n", "COSMOS3_AUDIOVISUAL_OUTPUT_ROOT = Path(\n", " os.environ.get(\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT\", COSMOS3_AUDIOVISUAL_ROOT / \"outputs\" / \"notebooks\")\n", ").resolve()\n", "\n", "os.environ[\"COSMOS3_REPO\"] = str(COSMOS3_REPO)\n", "os.environ[\"COSMOS3_GIT_URL\"] = COSMOS3_GIT_URL\n", "os.environ[\"COSMOS3_UV_GROUP\"] = COSMOS3_UV_GROUP\n", "os.environ[\"COSMOS3_UV_ENV\"] = str(COSMOS3_UV_ENV)\n", "os.environ.setdefault(\"UV_PROJECT_ENVIRONMENT\", str(COSMOS3_UV_ENV))\n", "os.environ[\"COSMOS3_NUM_GPUS\"] = COSMOS3_NUM_GPUS\n", "os.environ[\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT\"] = str(COSMOS3_AUDIOVISUAL_OUTPUT_ROOT)\n", "os.environ.setdefault(\"UV_CACHE_DIR\", str(Path.home() / \".cache\" / \"uv\"))\n", "os.environ.setdefault(\"HF_HOME\", str(Path.home() / \".cache\" / \"huggingface\"))\n", "os.environ.setdefault(\"CUDA_VISIBLE_DEVICES\", \"0,1,2,3\")\n", "os.environ.setdefault(\"COSMOS3_MASTER_ADDR\", \"127.0.0.1\")\n", "os.environ.setdefault(\"COSMOS3_TEXT_MASTER_PORT\", free_local_port())\n", "os.environ.setdefault(\"COSMOS3_IMAGE_MASTER_PORT\", free_local_port())\n", "\n", "print(f\"COSMOS_ROOT: {COSMOS_ROOT}\")\n", "for key in [\n", " \"COSMOS3_REPO\",\n", " \"COSMOS3_GIT_URL\",\n", " \"COSMOS3_UV_GROUP\",\n", " \"COSMOS3_UV_ENV\",\n", " \"COSMOS3_NUM_GPUS\",\n", " \"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT\",\n", " \"UV_CACHE_DIR\",\n", " \"UV_PROJECT_ENVIRONMENT\",\n", " \"HF_HOME\",\n", " \"CUDA_VISIBLE_DEVICES\",\n", "]:\n", " print(f\"{key}: {os.environ[key]}\")\n" ], "execution_count": null, "outputs": [], "id": "de55c30b" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Clone or Reuse Cosmos Framework\n" ] }, { "cell_type": "code", "metadata": {}, "source": [ "%%bash\n", "set -euo pipefail\n", "\n", "mkdir -p \"$(dirname \"$COSMOS3_REPO\")\"\n", "\n", "if [ -f \"$COSMOS3_REPO/pyproject.toml\" ] && [ -d \"$COSMOS3_REPO/cosmos_framework\" ]; then\n", " echo \"Using existing framework checkout: $COSMOS3_REPO\"\n", "elif [ -e \"$COSMOS3_REPO\" ]; then\n", " echo \"COSMOS3_REPO exists but is not a Cosmos Framework checkout: $COSMOS3_REPO\"\n", " echo \"Set COSMOS3_REPO to the staged framework checkout, for example packages/cosmos-framework.\"\n", " exit 1\n", "else\n", " echo \"Cloning $COSMOS3_GIT_URL into $COSMOS3_REPO\"\n", " git clone \"$COSMOS3_GIT_URL\" \"$COSMOS3_REPO\"\n", "fi\n", "\n", "cd \"$COSMOS3_REPO\"\n", "git status --short --branch\n", "git remote -v\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Install Native PyTorch Dependencies\n", "\n", "Installs framework dependencies with `cu130-train` by default.\n" ] }, { "cell_type": "code", "metadata": {}, "source": [ "%%bash\n", "set -euo pipefail\n", "\n", "if ! command -v uv >/dev/null 2>&1; then\n", " echo \"uv is not installed. Install it first: https://docs.astral.sh/uv/getting-started/installation/\"\n", " exit 1\n", "fi\n", "\n", "export GIT_LFS_SKIP_SMUDGE=1\n", "cd \"$COSMOS3_REPO\"\n", "export UV_PROJECT_ENVIRONMENT=\"${UV_PROJECT_ENVIRONMENT:-$COSMOS3_UV_ENV}\"\n", "echo \"Using UV_PROJECT_ENVIRONMENT=$UV_PROJECT_ENVIRONMENT\"\n", "uv sync --all-extras --group=\"$COSMOS3_UV_GROUP\"\n", "if [ ! -x \"$COSMOS3_UV_ENV/bin/python\" ]; then\n", " echo \"uv sync completed, but expected Python is missing: $COSMOS3_UV_ENV/bin/python\"\n", " echo \"Check UV_PROJECT_ENVIRONMENT above; uv created the environment somewhere else or the install failed.\"\n", " exit 1\n", "fi\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Verify GPU and Python Environment\n" ] }, { "cell_type": "code", "metadata": {}, "source": [ "%%bash\n", "set -euo pipefail\n", "\n", "cd \"$COSMOS3_REPO\"\n", "if [ ! -x \"$COSMOS3_UV_ENV/bin/python\" ]; then\n", " echo \"Missing $COSMOS3_UV_ENV/bin/python\"\n", " echo \"Run the Install Native PyTorch Dependencies cell first, or set UV_PROJECT_ENVIRONMENT/COSMOS3_UV_ENV to the uv environment path.\"\n", " exit 1\n", "fi\n", "CUDA_VISIBLE_DEVICES=\"$CUDA_VISIBLE_DEVICES\" \"$COSMOS3_UV_ENV/bin/python\" - <<'PY'\n", "import torch\n", "print(\"torch:\", torch.__version__)\n", "print(\"torch cuda:\", torch.version.cuda)\n", "print(\"cuda available:\", torch.cuda.is_available())\n", "print(\"device count:\", torch.cuda.device_count())\n", "for index in range(torch.cuda.device_count()):\n", " print(f\"device {index}:\", torch.cuda.get_device_name(index))\n", "PY\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Preview Available Inputs\n" ] }, { "cell_type": "code", "metadata": {}, "source": [ "from pathlib import Path\n", "import json\n", "from IPython.display import Image, display\n", "\n", "assets_dir = COSMOS3_AUDIOVISUAL_ROOT / \"assets\"\n", "for prompt_dir in sorted((assets_dir / \"prompts\").iterdir()):\n", " if not prompt_dir.is_dir():\n", " continue\n", " print(f\"{prompt_dir.relative_to(assets_dir)}:\")\n", " for prompt_path in sorted(prompt_dir.glob(\"*.json\")):\n", " data = json.loads(prompt_path.read_text())\n", " caption = (\n", " data.get(\"temporal_caption\")\n", " or data.get(\"comprehensive_t2i_caption\")\n", " or data.get(\"extra\", {}).get(\"prompt\", \"\")\n", " )\n", " print(f\" {prompt_path.name}: {caption[:180]}{'...' if len(caption) > 180 else ''}\")\n", " print()\n", "\n", "for image_dir in sorted((assets_dir / \"images\").iterdir()):\n", " if not image_dir.is_dir():\n", " continue\n", " print(f\"{image_dir.relative_to(assets_dir)}:\")\n", " for image_path in sorted(image_dir.iterdir()):\n", " if image_path.suffix.lower() in {\".jpg\", \".jpeg\", \".png\", \".webp\", \".bmp\"}:\n", " print(f\" {image_path.name}\")\n", " display(Image(filename=str(image_path), width=420))\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7. Define Asset Sets, Payload Helpers, and Viewer Helpers\n" ] }, { "cell_type": "code", "metadata": {}, "source": [ "import json\n", "import os\n", "from pathlib import Path\n", "from IPython.display import Image, display\n", "\n", "IMAGE_EXTENSIONS = {\".jpg\", \".jpeg\", \".png\", \".webp\", \".bmp\"}\n", "\n", "FIXED_SAMPLING = {\n", " \"num_steps\": 35,\n", " \"guidance\": 6.0,\n", " \"shift\": 10.0,\n", " \"fps\": 24,\n", " \"num_frames\": 189,\n", " \"resolution\": \"720\",\n", " \"aspect_ratio\": \"16,9\",\n", " \"seed\": 0,\n", "}\n", "\n", "# All asset paths are repo-relative under cookbooks/cosmos3/generator/audiovisual.\n", "# The Nano/Super cases without audio are intentionally separate so each run uses the matching model.\n", "ASSET_SETS = {\n", " \"t2i\": {\n", " \"model\": \"Cosmos3-Nano\",\n", " \"mode\": \"text2image\",\n", " \"prompt\": \"assets/prompts/text2image/robot_draping.json\",\n", " \"enable_sound\": False,\n", " },\n", " \"t2i_super\": {\n", " \"model\": \"Cosmos3-Super\",\n", " \"mode\": \"text2image\",\n", " \"prompt\": \"assets/prompts/text2image/robot_draping.json\",\n", " \"enable_sound\": False,\n", " },\n", " \"t2v_nano_noaudio\": {\n", " \"model\": \"Cosmos3-Nano\",\n", " \"mode\": \"text2video\",\n", " \"prompt\": \"assets/prompts/text2video/robot_kitchen.json\",\n", " \"enable_sound\": False,\n", " },\n", " \"t2vs\": {\n", " \"model\": \"Cosmos3-Nano\",\n", " \"mode\": \"text2video\",\n", " \"prompt\": \"assets/prompts/text2video/robot_pouring_water_audio.json\",\n", " \"enable_sound\": True,\n", " },\n", " \"i2v_nano_noaudio\": {\n", " \"model\": \"Cosmos3-Nano\",\n", " \"mode\": \"image2video\",\n", " \"prompt\": \"assets/prompts/image2video/car_driving.json\",\n", " \"image\": \"assets/images/image2video/car_driving.jpg\",\n", " \"enable_sound\": False,\n", " },\n", " \"i2vs\": {\n", " \"model\": \"Cosmos3-Nano\",\n", " \"mode\": \"image2video\",\n", " \"prompt\": \"assets/prompts/image2video/coastal_road_audio.json\",\n", " \"image\": \"assets/images/image2video/coastal_road_audio.jpg\",\n", " \"enable_sound\": True,\n", " },\n", " \"t2v_super_noaudio\": {\n", " \"model\": \"Cosmos3-Super\",\n", " \"mode\": \"text2video\",\n", " \"prompt\": \"assets/prompts/text2video/robot_kitchen.json\",\n", " \"enable_sound\": False,\n", " },\n", " \"i2v_super_noaudio\": {\n", " \"model\": \"Cosmos3-Super\",\n", " \"mode\": \"image2video\",\n", " \"prompt\": \"assets/prompts/image2video/car_driving.json\",\n", " \"image\": \"assets/images/image2video/car_driving.jpg\",\n", " \"enable_sound\": False,\n", " },\n", "}\n", "\n", "\n", "def asset_path(relative_path: str) -> Path:\n", " path = COSMOS3_AUDIOVISUAL_ROOT / relative_path\n", " if not path.exists():\n", " raise FileNotFoundError(path)\n", " return path.resolve()\n", "\n", "\n", "def compact_json_file(path: Path) -> str:\n", " return json.dumps(json.loads(path.read_text()), ensure_ascii=True, separators=(\",\", \":\"))\n", "\n", "\n", "def payload_dimensions(payload: dict) -> tuple[int, int]:\n", " if payload.get(\"resolution\") == \"720\" and payload.get(\"aspect_ratio\") == \"16,9\":\n", " return 720, 1280\n", " if payload.get(\"resolution\") == \"256\" and payload.get(\"aspect_ratio\") == \"16,9\":\n", " return 192, 320\n", " raise ValueError(f\"Unsupported payload resolution/aspect ratio: {payload.get('resolution')} {payload.get('aspect_ratio')}\")\n", "\n", "\n", "def resolve_payload_path(payload_path: Path, value: str) -> Path:\n", " path = Path(value)\n", " if path.is_absolute():\n", " return path\n", " return (payload_path.parent / path).resolve()\n", "\n", "\n", "def create_payload(use_case: str, *, backend: str) -> tuple[Path, Path, str]:\n", " spec = ASSET_SETS[use_case]\n", " payload_dir = Path(os.environ[\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT\"]) / backend / \"payloads\" / use_case\n", " output_dir = Path(os.environ[\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT\"]) / backend / use_case\n", " payload_dir.mkdir(parents=True, exist_ok=True)\n", " output_dir.mkdir(parents=True, exist_ok=True)\n", "\n", " prompt_path = asset_path(spec[\"prompt\"])\n", " negative_prompt = \"\"\n", " if spec[\"mode\"] != \"text2image\":\n", " negative_prompt_path = asset_path(f\"assets/negative_prompts/{spec['mode']}/neg_prompt.json\")\n", " negative_prompt = compact_json_file(negative_prompt_path)\n", " payload_path = payload_dir / f\"{use_case}.json\"\n", " payload = {\n", " \"model_mode\": spec[\"mode\"],\n", " \"name\": use_case,\n", " \"prompt\": compact_json_file(prompt_path),\n", " \"negative_prompt\": negative_prompt,\n", " \"enable_sound\": spec[\"enable_sound\"],\n", " **FIXED_SAMPLING,\n", " }\n", " if spec[\"mode\"] == \"text2image\":\n", " payload[\"num_frames\"] = 1\n", " if spec[\"mode\"] == \"image2video\":\n", " image_path = asset_path(spec[\"image\"])\n", " payload[\"vision_path\"] = os.path.relpath(image_path, payload_path.parent)\n", "\n", " payload_path.write_text(json.dumps(payload, indent=2) + \"\\n\")\n", "\n", " os.environ[f\"COSMOS3_{backend.upper()}_{use_case.upper()}_INPUT\"] = str(payload_path)\n", " os.environ[f\"COSMOS3_{backend.upper()}_{use_case.upper()}_OUTPUT\"] = str(output_dir)\n", "\n", " print(f\"model: {spec['model']}\")\n", " print(f\"payload: {payload_path}\")\n", " print(f\"output: {output_dir}\")\n", " print(f\"prompt: {prompt_path.relative_to(COSMOS_ROOT)}\")\n", " if \"vision_path\" in payload:\n", " image_display_path = resolve_payload_path(payload_path, payload[\"vision_path\"])\n", " print(f\"image: {image_display_path.relative_to(COSMOS_ROOT)}\")\n", " display(Image(filename=str(image_display_path), width=420))\n", " print(json.dumps({k: payload[k] for k in [\"model_mode\", \"name\", \"enable_sound\", \"num_steps\", \"guidance\", \"shift\", \"fps\", \"num_frames\", \"resolution\", \"aspect_ratio\", \"seed\"]}, indent=2))\n", " return payload_path, output_dir, spec[\"model\"]\n", "\n", "\n", "import base64\n", "import html\n", "from pathlib import Path\n", "from IPython.display import HTML, display\n", "\n", "\n", "def display_video(path: Path, *, width: int = 720) -> None:\n", " data = base64.b64encode(path.read_bytes()).decode(\"ascii\")\n", " label = html.escape(str(path))\n", " markup = f\"\"\"\n", "\n", "