Pipeline resilience #128
Status: Closed

Commits (12):
- `59bff4c` Add keyword_tagger link for automatic tagging (howethomas)
- `950bc3c` Add wtf_transcribe link for WTF transcription (howethomas)
- `7f74f86` Ensure vcon version 0.3.0 for webhook compatibility (howethomas)
- `161b623` Use HTTPS for apt sources in Dockerfile (howethomas)
- `74e747b` Add SigNoz observability configuration (howethomas)
- `c2bcba9` Merge all feature branches into onsite-dev (howethomas)
- `cefef64` Add /stats/queue endpoint for Redis queue depth monitoring (howethomas)
- `bd0715f` Optimize vCon ingest by removing redundant Redis operations (howethomas)
- `10082da` Move webhook from chain link to post-chain storage backend (howethomas)
- `c3ba79f` Add retry with port fallback and health-aware vfun selection (howethomas)
- `48efc72` Add SignOz observability config, docs, and utility scripts (howethomas)
- `631ca58` Update chain diagram for webhook-as-storage architecture (howethomas)
New file: `docker-compose.signoz.yml` (+158 lines)
```yaml
# SigNoz Observability Stack
# Usage: docker compose -f docker-compose.yml -f docker-compose.override.yml -f docker-compose.signoz.yml up -d
#
# After first run, execute schema migrations:
#   docker run --rm --network conserver signoz/signoz-schema-migrator:latest sync --dsn='tcp://signoz-clickhouse:9000'
#
# Access UI at: http://localhost:3301

networks:
  conserver:
    external: true

volumes:
  signoz_clickhouse_data:
  signoz_zookeeper_data:
  signoz_zookeeper_log:
  signoz_data:

services:
  signoz-zookeeper:
    image: zookeeper:3.9
    container_name: signoz-zookeeper
    hostname: signoz-zookeeper
    environment:
      - ZOO_AUTOPURGE_PURGEINTERVAL=1
      - ZOO_4LW_COMMANDS_WHITELIST=mntr,ruok,stat
    volumes:
      - signoz_zookeeper_data:/data
      - signoz_zookeeper_log:/datalog
    networks:
      - conserver
    healthcheck:
      test: ["CMD-SHELL", "echo ruok | nc localhost 2181 | grep imok"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

  signoz-clickhouse:
    image: clickhouse/clickhouse-server:24.1.2-alpine
    container_name: signoz-clickhouse
    hostname: signoz-clickhouse
    tty: true
    depends_on:
      signoz-zookeeper:
        condition: service_healthy
    volumes:
      - signoz_clickhouse_data:/var/lib/clickhouse
      - ./signoz/zz-clickhouse-config.xml:/etc/clickhouse-server/config.d/zz-clickhouse-config.xml:ro
      - ./signoz/clickhouse-users.xml:/etc/clickhouse-server/users.d/users.xml:ro
    environment:
      - CLICKHOUSE_DB=signoz_traces
      - CLICKHOUSE_USER=default
      - CLICKHOUSE_PASSWORD=
    ulimits:
      nofile:
        soft: 262144
        hard: 262144
    networks:
      - conserver
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8123/ping"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

  signoz-otel-collector:
    image: signoz/signoz-otel-collector:latest
    container_name: signoz-otel-collector
    hostname: signoz-otel-collector
    command:
      - "--config=/etc/otel-collector-config.yaml"
    depends_on:
      signoz-clickhouse:
        condition: service_healthy
    environment:
      - OTEL_RESOURCE_ATTRIBUTES=host.name=signoz-host,os.type=linux
    volumes:
      - ./signoz/otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro
    ports:
      - "4317:4317"  # OTLP gRPC receiver
      - "4318:4318"  # OTLP HTTP receiver
    networks:
      - conserver
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:13133/"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

  signoz:
    image: signoz/query-service:latest
    container_name: signoz
    hostname: signoz
    depends_on:
      signoz-clickhouse:
        condition: service_healthy
    environment:
      - ClickHouseUrl=tcp://signoz-clickhouse:9000
      - SIGNOZ_LOCAL_DB_PATH=/var/lib/signoz/signoz.db
      - DASHBOARDS_PATH=/root/config/dashboards
      - STORAGE=clickhouse
      - GODEBUG=netdns=go
      - TELEMETRY_ENABLED=true
      - DEPLOYMENT_TYPE=docker-standalone
    volumes:
      - signoz_data:/var/lib/signoz
      - ./signoz/dashboards:/root/config/dashboards
    ports:
      - "3301:8080"  # Web UI
    networks:
      - conserver
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/api/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

  # Override conserver and api to send traces/metrics to SignOz (OTLP HTTP)
  conserver:
    command: "opentelemetry-instrument python ./server/main.py"
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: http://signoz-otel-collector:4318
      OTEL_EXPORTER_OTLP_PROTOCOL: http/protobuf
      OTEL_TRACES_EXPORTER: otlp
      OTEL_METRICS_EXPORTER: otlp
      OTEL_LOGS_EXPORTER: otlp
      OTEL_SERVICE_NAME: conserver
    depends_on:
      signoz-otel-collector:
        condition: service_healthy

  api:
    command: /bin/bash -c "opentelemetry-instrument uvicorn server.api:app --host 0.0.0.0 --port 8000"
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: http://signoz-otel-collector:4318
      OTEL_EXPORTER_OTLP_PROTOCOL: http/protobuf
      OTEL_TRACES_EXPORTER: otlp
      OTEL_METRICS_EXPORTER: otlp
      OTEL_LOGS_EXPORTER: otlp
      OTEL_SERVICE_NAME: conserver.api
    depends_on:
      signoz-otel-collector:
        condition: service_healthy

  logspout-signoz:
    image: pavanputhra/logspout-signoz
    container_name: logspout-signoz
    restart: unless-stopped
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      SIGNOZ_LOG_ENDPOINT: http://172.17.0.1:8082
      ENV: prod
    command: signoz://172.17.0.1:8082
```
> **Contributor comment:** Details are very specific to one env. We should remove it from the open source repo.
New file (+216 lines):
# Performance Testing Notes

**Last Updated:** 2026-02-02

## Test Environment

### Servers
- **Conserver (vcon-server)**: http://localhost:8080 (token: `mulliganmccarthy`)
- **vfun (transcription)**: http://localhost:4380/wtf

### NAS Storage

**Mount Point:** `/mnt/nas`
```
64.187.219.131:/mnt/slave_recording → /mnt/nas (NFS4)
- rsize/wsize: 1MB
- Protocol: TCP
- Hard mount with 600s timeout
```
**Directory Structure:**
```
/mnt/nas/
├── Freeswitch1/              # 20 Freeswitch servers (1-20)
│   ├── 2026-01-19/           # Date directories (15+ days available)
│   │   ├── 06/               # Hour directories (00-23)
│   │   │   └── *.wav         # Recording files (~489k per day)
│   │   ├── 07/
│   │   └── ...
│   ├── 2026-01-20/
│   └── ...
├── Freeswitch2/
├── ...
├── Freeswitch20/
├── Batch1_recording/
├── pcaps_*/                  # Packet captures
└── fs_collect_by_number.sh   # Collection utility
```
**File Naming Pattern:**
```
{campaign}_{caller}_{callid}_{date}_{time}.wav
Example: 10508_12026661845_993317168030975_2026-01-19_06:47:08.wav

Fields:
- campaign: Campaign/extension ID (e.g., 10508, 6075, 9676)
- caller: Phone number (e.g., 12026661845)
- callid: Unique call ID (e.g., 993317168030975)
- date: YYYY-MM-DD
- time: HH:MM:SS
```
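The naming pattern above is regular enough to parse mechanically. A minimal sketch follows; the `parse_recording_name` helper is hypothetical (not one of the repo's scripts) and assumes the all-digit fields shown in the examples:

```python
import re

# Pattern: {campaign}_{caller}_{callid}_{date}_{time}.wav
RECORDING_RE = re.compile(
    r"^(?P<campaign>\d+)_(?P<caller>\d+)_(?P<callid>\d+)_"
    r"(?P<date>\d{4}-\d{2}-\d{2})_(?P<time>\d{2}:\d{2}:\d{2})\.wav$"
)

def parse_recording_name(name: str) -> dict:
    """Split a recording filename into its five fields; raise on mismatch."""
    m = RECORDING_RE.match(name)
    if m is None:
        raise ValueError(f"unexpected recording name: {name}")
    return m.groupdict()

fields = parse_recording_name(
    "10508_12026661845_993317168030975_2026-01-19_06:47:08.wav"
)
# fields["campaign"] == "10508", fields["date"] == "2026-01-19"
```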
**Scale:**
- ~489,000 recordings per day per Freeswitch server
- ~9.78 million recordings/day across all 20 servers
- ~938 KB average file size (~60 seconds @ 8kHz 16-bit)
- ~9 TB/day of new recordings
- 15+ days of historical data
- Access requires `nasgroup` membership

---
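The scale figures above are mutually consistent; a quick back-of-the-envelope check:

```python
# Sanity-check the scale estimates quoted above
recordings_per_server_per_day = 489_000
servers = 20
avg_file_kb = 938

total_per_day = recordings_per_server_per_day * servers
assert total_per_day == 9_780_000            # ~9.78 million recordings/day

# KB -> TiB; lands around 8.5, i.e. the quoted "~9 TB/day" ballpark
daily_tb = total_per_day * avg_file_kb / 1024**3
assert 8 < daily_tb < 10

# ~60 s of 8 kHz 16-bit mono PCM: 8000 samples/s * 2 bytes * 60 s
assert round(8000 * 2 * 60 / 1024) == 938    # KB, matching the average size
```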
## Performance Results (2026-02-02)

### Conserver API
| Metric | Value |
|--------|-------|
| Throughput | 151.68 req/s |
| Avg Latency | 57.22 ms |
| Success Rate | 100% |

### vfun Transcription (Local Files)
| Metric | Value |
|--------|-------|
| Throughput | 32.72 files/sec |
| Data Rate | 30.36 MB/sec |
| Peak GPU Utilization | 95% |

### vfun Transcription (NAS Files)
| Files | Workers | Throughput | Data Rate | Parallelism |
|-------|---------|------------|-----------|-------------|
| 100 | 32 | 48.40 files/sec | 34.08 MB/s | 25.9x |
| 200 | 64 | 45.60 files/sec | 30.92 MB/s | 47.9x |
| 500 | 64 | 43.63 files/sec | 30.85 MB/s | 59.4x |
### Full Pipeline (NAS → vfun → vCon → Conserver → vcon-mcp)
| Files | Workers | Throughput | vCons Stored | Success |
|-------|---------|------------|--------------|---------|
| 50 | 16 | 2,447 files/min | 35 | 100% |
| 500 | 48 | 2,576 files/min | 362 | 100% |
| 1,000 | 64 | **2,973 files/min** | 703 | 100% |

**Full Pipeline Capacity (single vfun instance):**
- ~3,000 files/min = **~4.3 million files/day**
- vCon creation adds minimal overhead (~1 ms per vCon)
- Conserver chain processing: ~10 ms per vCon
- Webhook to vcon-mcp (Supabase): ~100-200 ms per vCon
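The single-instance capacity figure follows directly from the best measured throughput:

```python
# Daily capacity from the best measured full-pipeline throughput
files_per_min = 2_973
per_day = files_per_min * 60 * 24
assert per_day == 4_281_120   # ~4.3 million files/day per vfun instance

# The webhook dominates per-vCon latency (100-200 ms vs ~11 ms for the
# rest), which is why it runs as parallel, non-blocking post-chain storage.
worst_case_overhead_ms = 1 + 10 + 200
assert worst_case_overhead_ms == 211
```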
**Key Findings:**
- NAS network storage does not bottleneck transcription
- GPU batching works efficiently (59.4x parallelism vs 64x max)
- Sustained ~44-48 files/sec with high concurrency
- 100% success rate across 1,500+ files
- Full pipeline maintains ~48 files/sec throughput

### vfun Batching Configuration
```
GPU_MAX_BATCH_SIZE = 64
GPU_COALESCE_TIMEOUT_US = 5000 (5ms)
GPU_COALESCE_MIN_FILL = 16
```
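These knobs describe a timeout-based coalescing policy: hold the first request, then gather more until the batch is full or the window closes. The sketch below illustrates that policy only; it is not vfun's actual implementation, and for brevity it does not model the `GPU_COALESCE_MIN_FILL` threshold:

```python
import queue
import time

GPU_MAX_BATCH_SIZE = 64
GPU_COALESCE_TIMEOUT_US = 5000   # 5 ms coalescing window

def coalesce_batch(q: "queue.Queue") -> list:
    """Block for one item, then coalesce more until the batch is full,
    the 5 ms window closes, or the queue runs dry."""
    batch = [q.get()]  # block until at least one item is available
    deadline = time.monotonic() + GPU_COALESCE_TIMEOUT_US / 1_000_000
    while len(batch) < GPU_MAX_BATCH_SIZE:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break  # coalescing window closed
        try:
            batch.append(q.get(timeout=remaining))
        except queue.Empty:
            break  # queue drained before the batch filled
    return batch
```

With a backlog, every call returns a full 64-item batch; under light load it returns whatever arrived within the window, trading up to 5 ms of latency for larger GPU batches.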
---

## Test Scripts

Located in `scripts/`:
- `nas_transcription_pipeline.py` - **Production pipeline** with vCon creation and storage
- `nas_stress_test.py` - High-concurrency vfun stress test with NAS files

### Running Tests

```bash
# Check servers
curl -s http://localhost:8080/docs | head -5
curl -s http://localhost:4380/ready

# Start vfun if needed
cd ~/strolid/vfun && ./vfun --port 4380

# Run vfun-only stress test
python3 scripts/nas_stress_test.py 200 64

# Run full pipeline (transcription + vCon storage)
python3 scripts/nas_transcription_pipeline.py --date 2026-01-19 --hour 06 --limit 500 --workers 48 --store-vcons

# Dry run to see file counts
python3 scripts/nas_transcription_pipeline.py --date 2026-01-19 --dry-run
```

### Pipeline Chain Configuration
```
main_chain:    ingress:default   → tag → expire_vcon → egress:processed → storage: supabase_webhook
transcription: ingress:transcribe → tag → wtf_transcribe → keyword_tagger → expire_vcon → egress:transcribed → storage: supabase_webhook
```
Note: `supabase_webhook` runs as a post-chain storage (parallel, non-blocking) via the `storage.webhook` module.
---

## vfun Stability Issues (CUDA Crashes)

### Root Cause Analysis (2026-02-02)

**Problem:** vfun crashes intermittently after processing hundreds of files under sustained load.

**Investigation findings:**
1. **NOT the NAS** - Files read correctly; NAS performance is stable
2. **NOT memory leaks** - GPU memory stable at ~12.6 GB throughout processing
3. **NOT single-file issues** - Crash-causing files process fine individually
4. **IS a CUDA batching issue** - Specific batch combinations trigger cuBLAS failures

**Error signature:**
```
RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling cublasLtMatmul
with transpose_mat1 1 transpose_mat2 0 m 1024 n 251 k 1024
```

**What happens:**
1. Under high concurrency, vfun batches audio files for GPU processing
2. Certain combinations of audio lengths create tensor dimensions that trigger cuBLAS matrix-multiplication failures
3. The CUDA error corrupts GPU state, leaving vfun hung (the process exists but is unresponsive on the `/ready` endpoint)
4. GPU memory shows 0 MiB used after the crash (resources released but the process not terminated)

**Affected dimensions:** The `n=251` parameter in the error suggests certain audio sequence lengths produce problematic matrix sizes during the transformer decoder forward pass.
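If odd sequence lengths such as `n=251` are indeed the trigger, a common mitigation in GPU inference code is to round batch sequence lengths up to a friendlier multiple before the matmul. This is a hypothetical sketch of that idea, not a confirmed fix for vfun:

```python
def pad_to_multiple(length: int, multiple: int = 8) -> int:
    """Round a sequence length up to the next multiple (e.g. 251 -> 256),
    so batched tensors avoid odd matmul inner dimensions."""
    return ((length + multiple - 1) // multiple) * multiple
```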
### Workarounds for Production

**1. Auto-restart script:**
```bash
#!/bin/bash
# Run pipeline with automatic vfun restart on crash
restart_vfun() {
    pkill -9 -f "vfun --port 4380"
    sleep 2
    cd ~/strolid/vfun && ./vfun --port 4380 > /tmp/vfun.log 2>&1 &
    sleep 10
}

# Check health every 30 seconds, restart if hung
while true; do
    if ! curl -s --max-time 5 http://localhost:4380/ready > /dev/null 2>&1; then
        echo "$(date) - vfun crash detected, restarting..."
        restart_vfun
    fi
    sleep 30
done
```

**2. Reduce concurrency** (may reduce throughput but fewer crashes):
- Try 32-48 workers instead of 64
- Smaller batches reduce the likelihood of problematic tensor dimensions

**3. Batch processing with checkpoints:**
- Process in batches of 2,000-3,000 files
- Restart vfun between batches preventively
- Track progress in checkpoint files
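Workaround 3 can be sketched as a thin driver around the existing scripts. Here `process_batch` and `restart_vfun` stand in for the real pipeline invocation and the restart script above; the checkpoint path and helper names are illustrative:

```python
import json
from pathlib import Path

BATCH_SIZE = 2000
CHECKPOINT = Path("/tmp/pipeline_checkpoint.json")  # illustrative location

def run_in_batches(files, process_batch, restart_vfun, checkpoint=CHECKPOINT):
    """Process files in fixed-size batches, checkpointing completed work
    and preventively restarting vfun between batches."""
    done = set(json.loads(checkpoint.read_text())) if checkpoint.exists() else set()
    todo = [f for f in files if f not in done]
    for start in range(0, len(todo), BATCH_SIZE):
        batch = todo[start:start + BATCH_SIZE]
        process_batch(batch)                   # e.g. nas_transcription_pipeline.py
        done.update(batch)
        checkpoint.write_text(json.dumps(sorted(done)))  # durable progress
        restart_vfun()                         # preventive restart between batches
```

Because progress is persisted after every batch, re-running the driver after a crash resumes from the last completed batch instead of re-transcribing everything.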
### Investigation Scripts

Located in `scripts/`:
- `find_bad_file.py` - Tests files sequentially to identify the crash point
- `run_pipeline_with_restart.sh` - Pipeline with auto-restart capability

### Logs to Check
- `/tmp/vfun.log` or `/tmp/vfun_test.log` - vfun stdout/stderr, including CUDA errors
- Pipeline logs show the last successful file before a crash
**Review comment (Medium Severity):** `TELEMETRY_ENABLED=true` contradicts the README fix for the SigNoz crash. `TELEMETRY_ENABLED` is set to `true` in the compose file, but the README explicitly documents that this causes the SigNoz query service to panic (nil-pointer crash), because the telemetry cron checks TTL for `signoz_logs.logs`, a table that doesn't exist after schema migration. The README even states the fix (`TELEMETRY_ENABLED=false`) is "already set in this repo," but it isn't. Additional location: signoz/README.md#L155-L158.