{ "cells": [ { "cell_type": "markdown", "id": "e7bdfb61-0e65-48c9-ab90-74cb19436615", "metadata": {}, "source": [ "# CHMv2 Model Inference" ] }, { "cell_type": "markdown", "id": "22ee78ba-5796-4cf2-8dd8-0babec77c297", "metadata": {}, "source": [ "### Setup" ] }, { "cell_type": "code", "execution_count": null, "id": "ccfbed2e-c938-4ae1-9b11-e3c12a4086dc", "metadata": {}, "outputs": [], "source": [ "import io\n", "import sys\n", "import urllib\n", "\n", "import matplotlib.pyplot as plt\n", "import rasterio\n", "\n", "import torch\n", "import torch.nn.functional as F\n", "from torchvision.transforms import v2\n", "from tqdm import tqdm\n", "\n", "\n", "# Set the DINOv3 repo location\n", "DINOV3_LOCATION = \"/PATH/TO/DINOV3/REPO\"\n", "print(f\"DINOv3 location set to {DINOV3_LOCATION}\")\n", "\n", "sys.path.append(DINOV3_LOCATION)" ] }, { "cell_type": "markdown", "id": "7ff9859a-71d6-4808-8187-eea8e625e4d5", "metadata": {}, "source": [ "### Model Loading\n", "We load the DINOv3 ViT-L satellite model and the corresponding CHMv2 decoder." ] }, { "cell_type": "code", "execution_count": 2, "id": "2db67419-3446-41c9-968b-c4eaa991e3f6", "metadata": { "output": { "id": 1817267195625465, "loadingStatus": "loaded" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Backbone does not define embed_dims, using [1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024] instead\n", "Backbone does not define input_pad_size, using patch_size=16 instead\n" ] }, { "data": { "text/plain": [ "Depther(\n", " (encoder): DinoVisionTransformerWrapper(\n", " (backbone): DinoVisionTransformer(\n", " (patch_embed): PatchEmbed(\n", " (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))\n", " (norm): Identity()\n", " )\n", " (rope_embed): RopePositionEmbedding()\n", " (blocks): ModuleList(\n", " (0-23): 24 x SelfAttentionBlock(\n", " (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", " (attn): SelfAttention(\n", " (qkv): LinearKMaskedBias(in_features=1024, out_features=3072, bias=True)\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls1): LayerScale()\n", " (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): LayerScale()\n", " )\n", " )\n", " (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", " (local_cls_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", " (head): Identity()\n", " )\n", " (patch_size_adapter): CenterPadding()\n", " )\n", " (decoder): CHMv2Head(\n", " (reassemble_blocks): ReassembleBlocks(\n", " (projects): ModuleList(\n", " (0): ConvModule(\n", " (conv): Conv2d(1024, 128, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1): ConvModule(\n", " (conv): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (2): ConvModule(\n", " (conv): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (3): ConvModule(\n", " (conv): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resize_layers): ModuleList(\n", " (0): ConvTranspose2d(128, 128, kernel_size=(4, 4), stride=(4, 4))\n", " (1): ConvTranspose2d(256, 256, kernel_size=(2, 2), stride=(2, 2))\n", " (2): Identity()\n", " (3): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", " )\n", " (readout_projects): ModuleList(\n", " (0-3): 4 x Sequential(\n", " (0): Linear(in_features=2048, out_features=1024, bias=True)\n", " (1): GELU(approximate='none')\n", " )\n", " )\n", " (batchnorm_layers): ModuleList(\n", " (0-3): 4 x Identity()\n", " )\n", " )\n", " (convs): ModuleList(\n", " (0): ConvModule(\n", " (conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", " )\n", " (1): ConvModule(\n", " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", " )\n", " (2): ConvModule(\n", " (conv): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", " )\n", " (3): ConvModule(\n", " (conv): Conv2d(1024, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", " )\n", " )\n", " (fusion_blocks): ModuleList(\n", " (0): FeatureFusionBlock(\n", " (project): ConvModule(\n", " (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (res_conv_unit1): None\n", " (res_conv_unit2): PreActResidualConvUnit(\n", " (conv1): ConvModule(\n", " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (activate): ReLU()\n", " )\n", " (conv2): ConvModule(\n", " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (activate): ReLU()\n", " )\n", " )\n", " )\n", " (1-3): 3 x FeatureFusionBlock(\n", " (project): ConvModule(\n", " (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (res_conv_unit1): PreActResidualConvUnit(\n", " (conv1): ConvModule(\n", " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (activate): ReLU()\n", " )\n", " (conv2): ConvModule(\n", " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (activate): ReLU()\n", " )\n", " )\n", " (res_conv_unit2): PreActResidualConvUnit(\n", " (conv1): ConvModule(\n", " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (activate): ReLU()\n", " )\n", " (conv2): ConvModule(\n", " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (activate): ReLU()\n", " )\n", " )\n", " )\n", " )\n", " (conv_depth): UpConvHeadCHMv2(\n", " (head): Sequential(\n", " (0): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (1): Interpolate()\n", " (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (3): ReLU()\n", " (4): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " )\n", " (features_to_depth): FeaturesToDepth()\n", ")" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from dinov3.hub.backbones import Weights\n", "\n", "chmv2_model = torch.hub.load(\n", " DINOV3_LOCATION,\n", " 'dinov3_vitl16_chmv2',\n", " source=\"local\",\n", " weights=\"\",\n", " backbone_weights=Weights.SAT493M, # or \n", ")\n", "chmv2_model.cuda().eval()" ] }, { "cell_type": "markdown", "id": "9c8170fa-224b-4df4-9863-aca11baae1c1", "metadata": {}, "source": [ "### Data Loading\n", "Now that we have the model set up, let's load the data" ] }, { "cell_type": "code", "execution_count": 3, "id": "fc300746-94a7-4248-8c4b-f5bce7a7e55f", "metadata": {}, "outputs": [], "source": [ "# Using test samples from the NEON dataset that can be downloaded following instructions in\n", "# https://github.com/facebookresearch/HighResCanopyHeight\n", "# Original dataset: National Ecological Observatory Network (NEON), 2022. Ecosystem Structure\n", "# URL: https://data.neonscience.org/data-products/DP3.30015.001.\n", "neon_images_uri = [\n", " \"https://dl.fbaipublicfiles.com/dinov3/notebooks/chmv2/2017_WLOU_1_NEON_D13_WLOU_DP3_419000_4416000_RGB.tif_1_1.tif\",\n", " \"https://dl.fbaipublicfiles.com/dinov3/notebooks/chmv2/2018_GUAN_1_NEON_D04_GUAN_DP3_725000_1985000_RGB.tif_2_1.tif\",\n", " \"https://dl.fbaipublicfiles.com/dinov3/notebooks/chmv2/2019_HOPB_3_NEON_D01_HOPB_DP3_717000_4705000_RGB.tif_1_1.tif\",\n", " \"https://dl.fbaipublicfiles.com/dinov3/notebooks/chmv2/2019_REDB_2_NEON_D15_REDB_DP3_433000_4516000_RGB.tif_2_2.tif\",\n", " \"https://dl.fbaipublicfiles.com/dinov3/notebooks/chmv2/2019_WLOU_2_NEON_D13_WLOU_DP3_420000_4417000_RGB.tif_0_0.tif\",\n", "\n", "]\n", "neon_images_list = []\n", "\n", "\n", "def load_image_as_tensor(uri: str) -> torch.Tensor:\n", " \"\"\"Load a rasterio image from URI as a PyTorch tensor.\"\"\"\n", " with urllib.request.urlopen(uri) as response:\n", " data = response.read()\n", " with rasterio.open(io.BytesIO(data)) as src:\n", " img = src.read()\n", " return torch.from_numpy(img)\n", "\n", "for neon_image_uri in neon_images_uri:\n", " neon_images_list.append(load_image_as_tensor(neon_image_uri))" ] }, { "cell_type": "markdown", "id": "33ae54fb-05bc-40d0-af41-6d60feb32a45", "metadata": {}, "source": [ "### Data Transforms" ] }, { "cell_type": "code", "execution_count": 4, "id": "29a5affc-e665-4cdb-93e7-da5e681b4a06", "metadata": {}, "outputs": [], "source": [ "CHMV2_MEAN = (0.420, 0.411, 0.296)\n", "CHMV2_STD = (0.213, 0.156, 0.143)\n", "\n", "def make_transform():\n", " to_tensor = v2.ToImage()\n", " to_float = v2.ToDtype(torch.float32, scale=True)\n", " normalize = v2.Normalize(mean=CHMV2_MEAN, std=CHMV2_STD)\n", " return v2.Compose([to_tensor, to_float, normalize])" ] }, { "cell_type": "markdown", "id": "728ebe9b-38ea-4a2d-8ccc-d232a207b14c", "metadata": {}, "source": [ "### Producing the canopy height map" ] }, { "cell_type": "code", "execution_count": 5, "id": "56357bef-4c28-492f-b531-e46b8bbb0284", "metadata": { "output": { "id": 1650136709675973, "loadingStatus": "loaded" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\rProcessing images: 0%| | 0/5 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "n = len(canopy_height_results)\n", "fig, axes = plt.subplots(n, 2, figsize=(10, 4 * n))\n", "\n", "for i, res in enumerate(canopy_height_results):\n", " axes[i, 0].imshow(neon_images_list[i].permute(1, 2, 0))\n", " axes[i, 0].set_title(f\"Sample {i}\")\n", " axes[i, 0].axis('off')\n", "\n", " axes[i, 1].imshow(canopy_height_results[i], cmap='viridis')\n", " axes[i, 1].set_title(\"Canopy height prediction\")\n", " axes[i, 1].axis('off')\n", "\n", "plt.tight_layout()\n", "plt.show()\n" ] } ], "metadata": { "fileHeader": "", "fileUid": "5d35d3c1-b110-46e7-a774-077d511f7f24", "isAdHoc": false, "kernelspec": { "display_name": "fairvit-py311-ptnightly-xformers-20250419 (conda)", "language": "python", "name": "conda_fairvit-py311-ptnightly-xformers-20250419" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 2 }