diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index 8a90770..5a0dc0d 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [3.1.0] - 2026-05-31 ### Fixed @@ -142,7 +142,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Mobile deployment guide (CoreML, TFLite) - Soft-NMS vectorization with torch.cdist for 10-50x inference speedup -## [2.10] - 2025-01-18 +## [2.1.0] - 2025-01-18 ### Add GitHub Workflows @@ -261,6 +261,5 @@ We use [Semantic Versioning](https://semver.org/): ## Links -- [Unreleased]: https://github.com/dronefreak/VisDrone-dataset-python-toolkit/compare/v2.0.0...HEAD - [2.0.0]: https://github.com/dronefreak/VisDrone-dataset-python-toolkit/releases/tag/v2.0.0 - [1.0.0]: https://github.com/dronefreak/VisDrone-dataset-python-toolkit/releases/tag/v1.0.0 diff --git a/.github/README.md b/.github/README.md index ef4cc03..7d04735 100644 --- a/.github/README.md +++ b/.github/README.md @@ -1,400 +1,146 @@ -# VisDrone Toolkit 2.0 +# VisDrone Toolkit -[![๐Ÿ Python 3.8+](https://img.shields.io/badge/Python-3.8+-3776AB?logo=python&logoColor=white&style=for-the-badge)](https://www.python.org/downloads/) -[![๐Ÿ”ฅ PyTorch 2.0+](https://img.shields.io/badge/PyTorch-2.0+-EE4C2C?logo=pytorch&logoColor=white&style=for-the-badge)](https://pytorch.org/) -[![โš–๏ธ License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-green?logo=open-source-initiative&logoColor=white&style=for-the-badge)](LICENSE) -[![๐Ÿ–ค Code style: Black](https://img.shields.io/badge/Code%20Style-Black-000000?logo=python&logoColor=white&style=for-the-badge)](https://github.com/psf/black) +[![Python 3.8+](https://img.shields.io/badge/Python-3.8+-3776AB?logo=python&logoColor=white)](https://www.python.org/) +[![PyTorch 2.0+](https://img.shields.io/badge/PyTorch-2.0+-EE4C2C?logo=pytorch&logoColor=white)](https://pytorch.org/) +[![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-green)](LICENSE) -Modern PyTorch toolkit for the VisDrone aerial object detection dataset with production-ready training pipelines, real-time inference, and optimizations for small object detection in drone imagery. +PyTorch toolkit for the [VisDrone aerial detection dataset](https://github.com/VisDrone/VisDrone-Dataset). Supports 33 models (4 torchvision + 29 YOLO), end-to-end training, evaluation, and inference. --- -## What's New in 2.0 - -**Core Improvements:** - -- PyTorch-first design with native Dataset implementation -- Multi-architecture support: Faster R-CNN, FCOS, RetinaNet (ResNet50 & MobileNet variants) -- Real-time webcam inference with pre-trained weights -- Modern format converters (COCO & YOLO, not just VOC) -- Production-ready CLI tools with rich progress tracking -- Advanced training features: data augmentation, multi-scale training, gradient accumulation -- Test-Time Augmentation (TTA) and Soft-NMS for improved accuracy - ---- - -## ๐Ÿ“ธ Detection Examples - -The model demonstrates robust performance across various aerial scenarios from the VisDrone dataset: - - - - - - - - - - -
- Urban Traffic Scene -
- Urban Traffic Detection -
- Dense vehicle and pedestrian detection in city intersection -
- Parking Lot Scene -
- Vertical Orientation -
- Multi-scale vehicle detection with varying orientations -
- Pedestrian Scene -
- Weather Effects -
- Detection of vehicles in extreme sunflare -
- Mixed Traffic -
- Mixed Traffic Analysis -
- Detection of cars, motorcycles, and pedestrians -
- -### Model Performance - -The Faster R-CNN ResNet50 model achieves state-of-the-art performance on VisDrone validation set: - -| Metric | Score | -| --------------- | --------- | -| **F1 Score** | **66.7%** | -| **Precision** | 71.0% | -| **Recall** | 62.9% | -| True Positives | 24,385 | -| False Positives | 9,951 | - -**Inference Speed** (RTX 4070 Super 12GB): - -- Standard: **55.6ms/image (18 FPS)** -- With TTA + Soft-NMS: ~333ms/image (3 FPS) - best quality - -### Usage Notes - -- **Recommended threshold**: 0.3-0.5 depending on precision/recall preference -- **Best for**: Top-down or oblique aerial views (10-100m altitude) -- **Challenges**: Objects <15 pixels, heavy occlusion, extreme viewing angles -- **Classes**: Detects 11 object categories (pedestrian, car, van, truck, bicycle, motorcycle, etc.) - ---- - -## Quick Start +## Installation ```bash -# Install git clone https://github.com/dronefreak/VisDrone-dataset-python-toolkit.git cd VisDrone-dataset-python-toolkit -python3 -m venv venv && source venv/bin/activate -pip install -e . - -# Test instantly with webcam (no training required) -python scripts/webcam_demo.py --model fasterrcnn_mobilenet - -# Train with best practices -python scripts/train.py \ - --train-img-dir data/VisDrone2019-DET-train/images \ - --train-ann-dir data/VisDrone2019-DET-train/annotations \ - --val-img-dir data/VisDrone2019-DET-val/images \ - --val-ann-dir data/VisDrone2019-DET-val/annotations \ - --model fasterrcnn_resnet50 \ - --epochs 200 \ - --batch-size 2 \ - --accumulation-steps 2 \ - --lr 0.005 \ - --amp \ - --augmentation \ - --multiscale \ - --small-anchors \ - --lr-schedule multistep \ - --lr-milestones 60 80 \ - --output-dir outputs/fasterrcnn_improved - -# Run inference with TTA + Soft-NMS -python scripts/inference.py \ - --checkpoint outputs/fasterrcnn_improved/best_model.pth \ - --model fasterrcnn_resnet50 \ - --input test_images/ \ - --output-dir results \ - --score-threshold 0.5 \ - --tta \ - --soft-nms \ - --nms-threshold 0.3 +python -m venv venv && source venv/bin/activate +pip install -e . # basic +pip install -e ".[dev]" # with dev tools ``` ---- - -## Features - -### Core Components - -- **PyTorch Dataset** โ€” Native VisDrone format with automatic filtering and multi-scale support -- **Model Zoo** โ€” 4 detection architectures ready for training -- **Format Converters** โ€” COCO and YOLO export with validation -- **Visualization** โ€” Publication-ready plots and detection overlays -- **CLI Tools** โ€” Train, evaluate, and infer with simple commands - -### Training Features - -- Mixed precision training (AMP) for 2x speedup -- Gradient accumulation for larger effective batch sizes -- Data augmentation (flips, rotations, color jitter, blur, fog) -- Multi-scale training (600-800px random scaling) -- Small anchor optimization for tiny aerial objects -- Multi-GPU support via DistributedDataParallel -- Learning rate scheduling (step, multistep, cosine) -- Automatic checkpointing with keyboard interrupt handling -- Real-time metrics with rich progress bars - -### Inference Features - -- Test-Time Augmentation (TTA) with multi-scale + flips -- Soft-NMS post-processing for better recall -- Real-time webcam detection -- Batch processing for images and videos -- Configurable confidence thresholds -- FPS benchmarking and performance profiling - ---- - -## Installation - -### Requirements - -- Python 3.8+ -- CUDA-capable GPU (recommended, 12GB+ VRAM for training) -- PyTorch 2.0+ - -### Setup +**Dataset layout** (download from [VisDrone-Dataset](https://github.com/VisDrone/VisDrone-Dataset)): ```bash -# 1. Virtual environment -python3 -m venv venv -source venv/bin/activate # Windows: venv\Scripts\activate - -# 2. PyTorch (choose one) -pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118 # GPU -pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu # CPU - -# 3. Install toolkit with dependencies -pip install -e . # Basic -pip install -e ".[dev]" # With dev tools -pip install albumentations # For data augmentation +data/ +โ”œโ”€โ”€ VisDrone2019-DET-train/images/ annotations/ +โ””โ”€โ”€ VisDrone2019-DET-val/images/ annotations/ ``` -### Dataset Download +--- + +## Models -Download from [VisDrone Dataset](https://github.com/VisDrone/VisDrone-Dataset): +| Model | Type | Notes | +| ---------------------------------------------- | ----------- | ------------------------------- | +| `fasterrcnn_resnet50` / `fasterrcnn_mobilenet` | Torchvision | Best accuracy / lightweight | +| `fcos_resnet50` | Torchvision | Anchor-free | +| `retinanet_resnet50` | Torchvision | Focal loss, class imbalance | +| `yolov8n/s/m/l/x` | YOLO v8 | Recommended for new experiments | +| `yolov9c/e/m` | YOLO v9 | Programmable gradient info | +| `yolov10n/s/m/b/l/x` | YOLO v10 | NMS-free inference | +| `yolo11n/s/m/l/x` | YOLO 11 | 2024 C3k2+C2PSA architecture | +| `yolo26n/s/m/l/x` | YOLO 26 | 2025, best efficiency | ```bash -data/ -โ”œโ”€โ”€ VisDrone2019-DET-train/ -โ”‚ โ”œโ”€โ”€ images/ -โ”‚ โ””โ”€โ”€ annotations/ -โ””โ”€โ”€ VisDrone2019-DET-val/ - โ”œโ”€โ”€ images/ - โ””โ”€โ”€ annotations/ +python scripts/train.py --available-models # list all 33 models ``` -See [INSTALL.md](INSTALL.md) for detailed setup instructions. - --- ## Usage -### Training +### Train ```bash -# List all available models (torchvision + YOLO) -python scripts/train.py --available-models - -# Optimized training with FasterRCNN (200 epochs, ~40 hours on RTX 4070 Super) +# Torchvision (Faster R-CNN) python scripts/train.py \ --train-img-dir data/VisDrone2019-DET-train/images \ --train-ann-dir data/VisDrone2019-DET-train/annotations \ - --val-img-dir data/VisDrone2019-DET-val/images \ - --val-ann-dir data/VisDrone2019-DET-val/annotations \ - --model fasterrcnn_resnet50 \ - --epochs 200 \ - --batch-size 2 \ - --accumulation-steps 2 \ - --lr 0.005 \ - --amp \ - --augmentation \ - --multiscale \ - --small-anchors \ - --lr-schedule multistep \ - --lr-milestones 60 80 \ + --val-img-dir data/VisDrone2019-DET-val/images \ + --val-ann-dir data/VisDrone2019-DET-val/annotations \ + --model fasterrcnn_resnet50 --epochs 200 --batch-size 2 \ + --amp --augmentation --multiscale --small-anchors \ + --lr 0.005 --lr-schedule multistep --lr-milestones 60 80 \ --output-dir outputs/fasterrcnn_200ep -# Training with YOLO v8+ (faster, lighter, recommended for new experiments) +# YOLO (delegates to Ultralytics engine) python scripts/train.py \ --train-img-dir data/VisDrone2019-DET-train/images \ --train-ann-dir data/VisDrone2019-DET-train/annotations \ - --val-img-dir data/VisDrone2019-DET-val/images \ - --val-ann-dir data/VisDrone2019-DET-val/annotations \ - --model yolov8n \ - --epochs 200 \ - --batch-size 16 \ - --accumulation-steps 2 \ - --lr 0.001 \ - --amp \ - --augmentation \ - --lr-schedule cosine \ + --val-img-dir data/VisDrone2019-DET-val/images \ + --val-ann-dir data/VisDrone2019-DET-val/annotations \ + --model yolov8n --epochs 200 --batch-size 16 --amp \ --output-dir outputs/yolov8n_200ep +``` -# Fast training for experimentation (50 epochs, MobileNet) -python scripts/train.py \ - --train-img-dir data/VisDrone2019-DET-train/images \ - --train-ann-dir data/VisDrone2019-DET-train/annotations \ - --model fasterrcnn_mobilenet \ - --epochs 50 \ - --batch-size 4 \ - --amp \ - --output-dir outputs/mobilenet_quick +Weights are saved as `best.pt` and `last.pt` inside `--output-dir`. -# Resume from checkpoint -python scripts/train.py \ - --resume outputs/fasterrcnn_200ep/checkpoint_epoch_100.pth \ - --epochs 200 +> **YOLO note:** `--multiscale`, `--small-anchors`, `--lr-schedule`, and `--accumulation-steps` are ignored for YOLO models โ€” these are handled internally by Ultralytics. `--num-classes` is automatically clamped to 11 (VisDrone's 11 real classes). + +### Evaluate + +```bash +# Torchvision โ€” P/R/F1 + optional pycocotools mAP +python scripts/evaluate.py \ + --checkpoint outputs/fasterrcnn_200ep/best.pt \ + --model fasterrcnn_resnet50 \ + --image-dir data/VisDrone2019-DET-val/images \ + --annotation-dir data/VisDrone2019-DET-val/annotations + +# YOLO โ€” mAP@0.5 and mAP@0.5:0.95 via Ultralytics val engine +python scripts/evaluate.py \ + --checkpoint outputs/yolov8n_200ep/yolov8n/weights/best.pt \ + --model yolov8n \ + --image-dir data/VisDrone2019-DET-val/images \ + --annotation-dir data/VisDrone2019-DET-val/annotations ``` -**Available Models:** - -| Model | Type | Speed | Notes | -| --------------------------------------------------------- | ----------- | -------- | -------------------------- | -| `fasterrcnn_resnet50` | Torchvision | ~45 FPS | Best accuracy, high VRAM | -| `fasterrcnn_mobilenet` | Torchvision | ~80 FPS | Lightweight, fast | -| `fcos_resnet50` | Torchvision | ~55 FPS | Anchor-free | -| `retinanet_resnet50` | Torchvision | ~65 FPS | Good for small objects | -| `yolov8n` | YOLO v8 | ~280 FPS | Fastest v8, 1.5 GB VRAM | -| `yolov8s` / `yolov8m` / `yolov8l` / `yolov8x` | YOLO v8 | varies | Larger = more accurate | -| `yolov9c` / `yolov9e` / `yolov9m` | YOLO v9 | varies | Programmable gradient nets | -| `yolov10n` ... `yolov10x` | YOLO v10 | varies | NMS-free inference | -| `yolo11n` / `yolo11s` / `yolo11m` / `yolo11l` / `yolo11x` | YOLO11 | varies | 2024 C3k2+C2PSA arch | -| `yolo26n` / `yolo26s` / `yolo26m` / `yolo26l` / `yolo26x` | YOLO26 | varies | 2025, best efficiency | - -**Key Training Arguments:** - -- `--available-models` - List all registered models and exit -- `--augmentation` - Enable data augmentation (flips, rotations, color) -- `--multiscale` - Random image scaling 600-800px (torchvision only) -- `--small-anchors` - Use 16-256px anchors (torchvision only) -- `--accumulation-steps` - Simulate larger batch (2 steps = 2x batch size) -- `--lr-schedule cosine|multistep|step` - LR schedule type -- `--amp` - Mixed precision training (2x speedup) - -> **Note for YOLO models:** `--multiscale`, `--small-anchors`, `--lr-schedule`, and `--accumulation-steps` are ignored โ€” YOLO v8+ is anchor-free and these are handled internally by Ultralytics. Use `--batch-size 16` or higher (YOLO is much more memory-efficient than FasterRCNN). `--num-classes` is automatically clamped to 11 for YOLO (VisDrone's 11 real classes after filtering the ignored-regions label). +Outputs a rich per-class metrics table and saves `eval_outputs/metrics.json`. ### Inference ```bash -# Standard inference (fast) +# Images / directory / video โ€” auto-detected from file extension python scripts/inference.py \ - --checkpoint outputs/fasterrcnn_200ep/best_model.pth \ - --model fasterrcnn_resnet50 \ - --input test_images/ \ - --output-dir results \ - --score-threshold 0.5 - -# Best quality (TTA + Soft-NMS, slower but more accurate) -python scripts/inference.py \ - --checkpoint outputs/fasterrcnn_200ep/best_model.pth \ - --model fasterrcnn_resnet50 \ - --input test_images/ \ - --output-dir results_best \ - --score-threshold 0.5 \ - --tta \ - --soft-nms \ - --nms-threshold 0.3 - -# Video processing -python scripts/inference.py \ - --checkpoint outputs/fasterrcnn_200ep/best_model.pth \ - --model fasterrcnn_resnet50 \ - --input drone_video.mp4 \ - --output-dir results_video \ - --score-threshold 0.5 + --checkpoint outputs/yolov8n_200ep/yolov8n/weights/best.pt \ + --model yolov8n --input data/images/ --output-dir results -# Single image python scripts/inference.py \ - --checkpoint outputs/fasterrcnn_200ep/best_model.pth \ - --model fasterrcnn_resnet50 \ - --input image.jpg \ - --score-threshold 0.5 + --checkpoint outputs/fasterrcnn_200ep/best.pt \ + --model fasterrcnn_resnet50 --input drone_video.mp4 \ + --soft-nms --score-threshold 0.5 --output-dir results ``` -### Evaluation +### Webcam / Video Demo ```bash -# Evaluate with TTA + Soft-NMS (matches training metrics) -python scripts/evaluate.py \ - --checkpoint outputs/fasterrcnn_200ep/best_model.pth \ - --model fasterrcnn_resnet50 \ - --image-dir data/VisDrone2019-DET-val/images \ - --annotation-dir data/VisDrone2019-DET-val/annotations \ - --score-threshold 0.5 \ - --tta \ - --soft-nms \ - --output-dir eval_results \ - --save-predictions - -# Standard evaluation (faster) -python scripts/evaluate.py \ - --checkpoint outputs/fasterrcnn_200ep/best_model.pth \ - --model fasterrcnn_resnet50 \ - --image-dir data/VisDrone2019-DET-val/images \ - --annotation-dir data/VisDrone2019-DET-val/annotations \ - --score-threshold 0.5 \ - --output-dir eval_results -``` - -### Webcam Demo +# Webcam (default source=0) +python scripts/webcam_demo.py \ + --checkpoint outputs/yolov8n_200ep/yolov8n/weights/best.pt \ + --model yolov8n -```bash -# With trained model +# Video file or RTSP stream python scripts/webcam_demo.py \ - --checkpoint outputs/fasterrcnn_200ep/best_model.pth \ - --model fasterrcnn_resnet50 \ - --score-threshold 0.5 + --checkpoint outputs/fasterrcnn_200ep/best.pt \ + --model fasterrcnn_resnet50 --source drone_video.mp4 -# With pre-trained COCO weights (no training needed) +# COCO pretrained weights โ€” no VisDrone training needed python scripts/webcam_demo.py --model fasterrcnn_mobilenet - -# Custom camera and threshold -python scripts/webcam_demo.py \ - --checkpoint outputs/fasterrcnn_200ep/best_model.pth \ - --model fasterrcnn_resnet50 \ - --camera 1 \ - --score-threshold 0.7 ``` -**Controls:** `q` quit | `s` save frame | `SPACE` pause +Controls: `q` quit | `s` save frame | `Space` pause ### Format Conversion ```bash -# To COCO -python scripts/convert_annotations.py \ - --format coco \ - --image-dir data/images \ - --annotation-dir data/annotations \ +# VisDrone โ†’ COCO +python scripts/convert_annotations.py --format coco \ + --image-dir data/images --annotation-dir data/annotations \ --output annotations_coco.json -# To YOLO -python scripts/convert_annotations.py \ - --format yolo \ - --image-dir data/images \ - --annotation-dir data/annotations \ +# VisDrone โ†’ YOLO +python scripts/convert_annotations.py --format yolo \ + --image-dir data/images --annotation-dir data/annotations \ --output-dir data/yolo_labels ``` @@ -404,236 +150,63 @@ python scripts/convert_annotations.py \ from visdrone_toolkit import VisDroneDataset, get_model from visdrone_toolkit.utils import collate_fn from torch.utils.data import DataLoader -import torch - -# Load dataset with augmentation -from training_config import get_training_augmentation dataset = VisDroneDataset( image_dir="data/images", annotation_dir="data/annotations", - transforms=get_training_augmentation(), filter_ignored=True, filter_crowd=True, - multiscale_training=True, ) - -# Get model with custom configuration -model = get_model("fasterrcnn_resnet50", num_classes=12, pretrained=True) - -# Create dataloader loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn, shuffle=True) - -# Training loop -model.train() -optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9) - -for images, targets in loader: - loss_dict = model(images, targets) - losses = sum(loss for loss in loss_dict.values()) - - optimizer.zero_grad() - losses.backward() - optimizer.step() -``` - ---- - -## Models - -| Model | Speed | Accuracy | GPU Memory | Best For | -| -------------------------- | ----- | -------- | ---------- | --------------------------- | -| **Faster R-CNN ResNet50** | โ˜…โ˜…โ˜…โ˜†โ˜† | โ˜…โ˜…โ˜…โ˜…โ˜† | 6GB | General use, best balance | -| **Faster R-CNN MobileNet** | โ˜…โ˜…โ˜…โ˜…โ˜… | โ˜…โ˜…โ˜…โ˜†โ˜† | 3GB | Real-time, edge devices | -| **FCOS ResNet50** | โ˜…โ˜…โ˜…โ˜†โ˜† | โ˜…โ˜…โ˜…โ˜…โ˜† | 6GB | Dense objects, anchor-free | -| **RetinaNet ResNet50** | โ˜…โ˜…โ˜…โ˜†โ˜† | โ˜…โ˜…โ˜…โ˜…โ˜† | 6GB | Class imbalance, focal loss | - -### Performance Benchmarks - -**VisDrone2019-DET-val** (RTX 4070 Super 12GB, batch_size=2): - -| Model | F1 Score | Precision | Recall | FPS (Standard) | FPS (TTA) | -| --------------------- | --------- | --------- | --------- | -------------- | --------- | -| Faster R-CNN ResNet50 | **66.7%** | **71.0%** | **62.9%** | **18** | **3** | -| FCOS ResNet50 | 48.8% | 43.8% | 55.1% | 16 | 2.5 | - -_Training: 200 epochs with augmentation, multi-scale, small anchors, and optimized hyperparameters_ - ---- - -## ๐Ÿค— HuggingFace Model Card - -Pre-trained weights are available on HuggingFace: - -```python -from visdrone_toolkit import get_model -import torch - -# Load model with trained weights -model = get_model('fasterrcnn_resnet50', num_classes=12, pretrained=False) -checkpoint = torch.load('path/to/best_model.pth') -model.load_state_dict(checkpoint['model_state_dict']) -model.eval() - -# Run inference (see inference.py for complete example) +model = get_model("fasterrcnn_resnet50", num_classes=12, pretrained=True) ``` -**Model Card**: [dronefreak/visdrone-fasterrcnn-resnet50](https://huggingface.co/dronefreak/visdrone-fasterrcnn-resnet50) - -**Training Details**: - -- **Dataset**: VisDrone2019-DET (6,471 training images) -- **Epochs**: 200 -- **Augmentation**: Horizontal flip, rotation, shift-scale-rotate, color jitter, blur, fog -- **Multi-scale**: Random scaling 600-800px per iteration -- **Optimization**: Small anchors (16-256px), Soft-NMS, lower NMS threshold (0.3) -- **Hardware**: Single RTX 4070 Super (12GB VRAM) -- **Training time**: ~40 hours - --- -## Advanced Usage - -### Custom Augmentations - -```python -import albumentations as A - -transform = A.Compose([ - A.HorizontalFlip(p=0.5), - A.RandomRotate90(p=0.3), - A.RandomBrightnessContrast(p=0.5), - A.HueSaturationValue(p=0.3), -], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels'])) - -dataset = VisDroneDataset( - image_dir="data/images", - annotation_dir="data/annotations", - transforms=transform, - multiscale_training=True, -) -``` - -### Multi-GPU Training - -```python -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel +## Performance -dist.init_process_group(backend='nccl') -model = DistributedDataParallel(model, device_ids=[local_rank]) -``` +Faster R-CNN ResNet50, VisDrone2019-DET-val (200 epochs, RTX 4070 Super): -### ONNX Export +| Metric | Score | +| --------- | ------------------- | +| F1 | 66.7% | +| Precision | 71.0% | +| Recall | 62.9% | +| Speed | 18 FPS (55ms/image) | -```python -import torch - -model.eval() -dummy_input = torch.randn(1, 3, 800, 800) -torch.onnx.export( - model, dummy_input, "model.onnx", - opset_version=11, - input_names=['input'], - output_names=['boxes', 'labels', 'scores'] -) -``` +YOLO v8n after 1 epoch (untrained baseline): mAP@0.5 = 0.119, mAP@0.5:0.95 = 0.062. --- -## Documentation - -- [Installation Guide](INSTALL.md) โ€” Detailed setup -- [Quick Reference](QUICKSTART.md) โ€” Command cheatsheet -- [Scripts Documentation](scripts/README.md) โ€” CLI tools -- [Configuration Guide](configs/README.md) โ€” Training configs -- [Test Documentation](tests/README.md) โ€” Running tests -- [Contributing Guide](CONTRIBUTING.md) โ€” Development workflow -- [Changelog](CHANGELOG.md) โ€” Version history - ---- - -## Contributing - -Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. - -### Quick Guide +## Development ```bash -# Fork and clone -git clone https://github.com/dronefreak/VisDrone-dataset-python-toolkit.git - -# Setup dev environment -make setup-venv && source venv/bin/activate -make install-dev - -# Make changes and test -make format lint test - -# Submit PR -git checkout -b feature/your-feature -git commit -m "Add feature" -git push origin feature/your-feature +make format lint test # format + lint + run tests +python -m pytest # 203 tests, ~63% coverage ``` +Pre-commit hooks: Black, Ruff, isort, mypy. + --- ## Citation -If you use this toolkit, please cite: - ```bibtex @misc{visdrone_toolkit_2025, author = {Saksena, Saumya Kumaar}, - title = {VisDrone Toolkit 2.0: Modern PyTorch Implementation}, - year = {2025}, - publisher = {GitHub}, - url = {https://github.com/dronefreak/VisDrone-dataset-python-toolkit} + title = {VisDrone Toolkit 2.0}, + year = {2025}, + url = {https://github.com/dronefreak/VisDrone-dataset-python-toolkit} } -``` - -Original VisDrone dataset: -```bibtex @article{zhu2018visdrone, - title={Vision Meets Drones: A Challenge}, - author={Zhu, Pengfei and Wen, Longyin and Bian, Xiao and Ling, Haibin and Hu, Qinghua}, - journal={arXiv preprint arXiv:1804.07437}, - year={2018} + title = {Vision Meets Drones: A Challenge}, + author = {Zhu, Pengfei and Wen, Longyin and Bian, Xiao and Ling, Haibin and Hu, Qinghua}, + journal = {arXiv preprint arXiv:1804.07437}, + year = {2018} } ``` --- -## License - -Apache License 2.0 โ€” see [LICENSE](LICENSE) - ---- - -## Acknowledgments - -- **VisDrone Team** for the dataset -- **PyTorch & Torchvision** for the framework -- All contributors to this project - ---- - -## Roadmap - -- [x] Advanced training features (augmentation, multi-scale, gradient accumulation) -- [x] TTA and Soft-NMS post-processing -- [x] Rich progress tracking with metrics -- [ ] VisDrone video task support -- [ ] Weights & Biases integration -- [ ] TensorRT optimization -- [ ] Docker deployment -- [x] YOLO v8, v9, v10, YOLO11, YOLO26 architectures (29 variants) -- [ ] DETR architecture -- [ ] Mobile deployment guide - ---- - -**Project Stats:** v2.0.0 | Python 3.8+ | PyTorch 2.0+ | 66 tests | >80% coverage | 66.7% F1 on VisDrone - -**Issues & Support:** [GitHub Issues](https://github.com/dronefreak/VisDrone-dataset-python-toolkit/issues) +[Changelog](CHANGELOG.md) ยท [Issues](https://github.com/dronefreak/VisDrone-dataset-python-toolkit/issues) ยท Apache 2.0 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8430cd2..9474f13 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,7 +25,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ruff black isort pydocstyle mypy bandit[toml] + pip install ruff==0.1.15 black isort pydocstyle mypy bandit[toml] pip install types-PyYAML types-setuptools pip install -e . diff --git a/scripts/inference.py b/scripts/inference.py index 3d889fd..a226d26 100644 --- a/scripts/inference.py +++ b/scripts/inference.py @@ -34,7 +34,7 @@ import numpy as np import torch -from visdrone_toolkit.utils import VISDRONE_CLASSES, get_model +from visdrone_toolkit.utils import YOLO_CLASS_COLORS, YOLO_CLASSES, get_model _IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp"} _VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov", ".mkv", ".webm"} @@ -104,7 +104,7 @@ def run_yolo( conf=score_threshold, device=device, imgsz=imgsz, - save=False, + save=True, verbose=True, ) @@ -122,13 +122,7 @@ def run_yolo( labels = result.boxes.cls.cpu().numpy().astype(int) # Custom visualization - viz = draw_detections( - frame, - boxes, - scores, - labels, - VISDRONE_CLASSES, - ) + viz = draw_detections(frame, boxes, scores, labels, YOLO_CLASSES, YOLO_CLASS_COLORS) # Save image_path = Path(result.path) @@ -246,6 +240,7 @@ def draw_detections( scores: np.ndarray, labels: np.ndarray, class_names: list[str], + class_colors: dict[int, tuple[int, int, int]], ) -> np.ndarray: """Draw bounding boxes and labels on a BGR frame.""" out = frame.copy() @@ -262,12 +257,15 @@ def draw_detections( for box, score, label in zip(boxes, scores, labels): x1, y1, x2, y2 = box.astype(int) + # Get class color + color = class_colors.get(label, (0, 255, 0)) + # Draw box cv2.rectangle( out, (x1, y1), (x2, y2), - (0, 255, 0), + color, box_thickness, ) @@ -282,13 +280,12 @@ def draw_detections( font_scale, font_thickness, ) - # Filled label background cv2.rectangle( out, (x1, y1 - th - baseline - 4), (x1 + tw + 4, y1), - (0, 255, 0), + color, -1, ) @@ -299,7 +296,7 @@ def draw_detections( (x1 + 2, y1 - 4), cv2.FONT_HERSHEY_SIMPLEX, font_scale, - (0, 0, 0), + (255, 255, 255), font_thickness, cv2.LINE_AA, ) @@ -337,14 +334,24 @@ def run_torchvision_images( if save_viz: viz = draw_detections( - frame, result["boxes"], result["scores"], result["labels"], VISDRONE_CLASSES + frame, + result["boxes"], + result["scores"], + result["labels"], + YOLO_CLASSES, + YOLO_CLASS_COLORS, ) out_path = output_dir / f"{image_path.stem}_pred.jpg" cv2.imwrite(str(out_path), viz) if show: viz = draw_detections( - frame, result["boxes"], result["scores"], result["labels"], VISDRONE_CLASSES + frame, + result["boxes"], + result["scores"], + result["labels"], + YOLO_CLASSES, + YOLO_CLASS_COLORS, ) cv2.imshow("VisDrone Inference", viz) if cv2.waitKey(0) == ord("q"): @@ -401,7 +408,12 @@ def run_torchvision_video( total_det += len(result["boxes"]) viz = draw_detections( - frame, result["boxes"], result["scores"], result["labels"], VISDRONE_CLASSES + frame, + result["boxes"], + result["scores"], + result["labels"], + YOLO_CLASSES, + YOLO_CLASS_COLORS, ) if writer is not None: diff --git a/scripts/webcam_demo.py b/scripts/webcam_demo.py index 4c4079e..fcd1e15 100644 --- a/scripts/webcam_demo.py +++ b/scripts/webcam_demo.py @@ -310,8 +310,7 @@ def main() -> None: cap.release() cv2.destroyAllWindows() print( - f"\nFrames: {frame_count} Saved: {saved_count} " - f"Avg FPS: {fps_counter.get_fps():.1f}" + f"\nFrames: {frame_count} Saved: {saved_count} Avg FPS: {fps_counter.get_fps():.1f}" ) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index ac9c505..464d88b 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -347,30 +347,75 @@ class TestInferenceDrawDetections: def test_draws_on_frame(self): from scripts.inference import draw_detections + YOLO_CLASS_COLORS = { + 0: (255, 0, 0), # pedestrian - red + 1: (255, 128, 0), # people - orange + 2: (255, 255, 0), # bicycle - yellow + 3: (0, 255, 0), # car - green + 4: (0, 255, 128), # van - light green + 5: (0, 255, 255), # truck - cyan + 6: (0, 128, 255), # tricycle - light blue + 7: (0, 0, 255), # awning-tricycle - blue + 8: (128, 0, 255), # bus - purple + 9: (255, 0, 255), # motor - magenta + 10: (255, 0, 128), # others - pink + } + frame = _make_image(100, 120) boxes = np.array([[5, 5, 30, 30]], dtype=np.float32) scores = np.array([0.9]) labels = np.array([1]) - result = draw_detections(frame, boxes, scores, labels, ["ignored", "pedestrian"]) + result = draw_detections( + frame, boxes, scores, labels, ["ignored", "pedestrian"], class_colors=YOLO_CLASS_COLORS + ) assert result.shape == frame.shape def test_empty_detections(self): from scripts.inference import draw_detections + YOLO_CLASS_COLORS = { + 0: (255, 0, 0), # pedestrian - red + 1: (255, 128, 0), # people - orange + 2: (255, 255, 0), # bicycle - yellow + 3: (0, 255, 0), # car - green + 4: (0, 255, 128), # van - light green + 5: (0, 255, 255), # truck - cyan + 6: (0, 128, 255), # tricycle - light blue + 7: (0, 0, 255), # awning-tricycle - blue + 8: (128, 0, 255), # bus - purple + 9: (255, 0, 255), # motor - magenta + 10: (255, 0, 128), # others - pink + } frame = _make_image() - result = draw_detections(frame, np.zeros((0, 4)), np.array([]), np.array([]), []) + result = draw_detections( + frame, np.zeros((0, 4)), np.array([]), np.array([]), [], class_colors=YOLO_CLASS_COLORS + ) assert result.shape == frame.shape def test_label_out_of_range(self): from scripts.inference import draw_detections frame = _make_image() + YOLO_CLASS_COLORS = { + 0: (255, 0, 0), # pedestrian - red + 1: (255, 128, 0), # people - orange + 2: (255, 255, 0), # bicycle - yellow + 3: (0, 255, 0), # car - green + 4: (0, 255, 128), # van - light green + 5: (0, 255, 255), # truck - cyan + 6: (0, 128, 255), # tricycle - light blue + 7: (0, 0, 255), # awning-tricycle - blue + 8: (128, 0, 255), # bus - purple + 9: (255, 0, 255), # motor - magenta + 10: (255, 0, 128), # others - pink + } result = draw_detections( frame, np.array([[0, 0, 20, 20]], dtype=np.float32), np.array([0.8]), np.array([99]), ["only_one"], + class_colors=YOLO_CLASS_COLORS, ) assert result is not None diff --git a/visdrone_toolkit/utils.py b/visdrone_toolkit/utils.py index 276774d..aa486a1 100644 --- a/visdrone_toolkit/utils.py +++ b/visdrone_toolkit/utils.py @@ -40,6 +40,36 @@ "others", # 11 ] +# YOLO class names (exclude "ignored-regions" since YOLO doesn't support ignore labels) +YOLO_CLASSES = [ + "pedestrian", + "people", + "bicycle", + "car", + "van", + "truck", + "tricycle", + "awning-tricycle", + "bus", + "motor", + "others", +] + +# YOLO class colors (RGB) +YOLO_CLASS_COLORS = { + 0: (255, 0, 0), # pedestrian - red + 1: (255, 128, 0), # people - orange + 2: (255, 255, 0), # bicycle - yellow + 3: (0, 255, 0), # car - green + 4: (0, 255, 128), # van - light green + 5: (0, 255, 255), # truck - cyan + 6: (0, 128, 255), # tricycle - light blue + 7: (0, 0, 255), # awning-tricycle - blue + 8: (128, 0, 255), # bus - purple + 9: (255, 0, 255), # motor - magenta + 10: (255, 0, 128), # others - pink +} + # Number of classes (excluding background for torchvision models) NUM_CLASSES = len(VISDRONE_CLASSES)