Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,12 @@ LOGGING_CONFIG_FILE=server/logging_dev.conf

# Groq API key for Whisper transcription
GROQ_API_KEY=your_groq_api_key_here

# OpenTelemetry (used when running with docker-compose.signoz.yml)
# conserver and api send traces/metrics to signoz-otel-collector when the SignOz stack is enabled
# OTEL_EXPORTER_OTLP_ENDPOINT=http://signoz-otel-collector:4318
# OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
# OTEL_TRACES_EXPORTER=otlp
# OTEL_METRICS_EXPORTER=otlp
# OTEL_LOGS_EXPORTER=otlp
# OTEL_SERVICE_NAME=conserver
158 changes: 158 additions & 0 deletions docker-compose.signoz.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# SigNoz Observability Stack
#
# Usage:
#   docker compose -f docker-compose.yml -f docker-compose.override.yml -f docker-compose.signoz.yml up -d
#
# After first run, execute schema migrations:
#   docker run --rm --network conserver signoz/signoz-schema-migrator:latest sync --dsn='tcp://signoz-clickhouse:9000'
#
# Access UI at: http://localhost:3301

# Join the externally created "conserver" network so the SignOz services can
# reach (and be reached by) the main application stack.
networks:
  conserver:
    external: true

# Named volumes persist ClickHouse data, ZooKeeper state, and the SignOz
# query-service database across container restarts.
volumes:
  signoz_clickhouse_data:
  signoz_zookeeper_data:
  signoz_zookeeper_log:
  signoz_data:

services:
  # ZooKeeper is required by the SignOz ClickHouse setup for replication/DDL
  # coordination, even in a single-node deployment.
  signoz-zookeeper:
    image: zookeeper:3.9
    container_name: signoz-zookeeper
    hostname: signoz-zookeeper
    environment:
      - ZOO_AUTOPURGE_PURGEINTERVAL=1
      # Allow only the four-letter-word commands the healthcheck needs.
      - ZOO_4LW_COMMANDS_WHITELIST=mntr,ruok,stat
    volumes:
      - signoz_zookeeper_data:/data
      - signoz_zookeeper_log:/datalog
    networks:
      - conserver
    healthcheck:
      # "ruok" returns "imok" when the server is serving requests.
      test: ["CMD-SHELL", "echo ruok | nc localhost 2181 | grep imok"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

  # ClickHouse is the storage backend for all SignOz telemetry data.
  signoz-clickhouse:
    image: clickhouse/clickhouse-server:24.1.2-alpine
    container_name: signoz-clickhouse
    hostname: signoz-clickhouse
    tty: true
    depends_on:
      signoz-zookeeper:
        condition: service_healthy
    volumes:
      - signoz_clickhouse_data:/var/lib/clickhouse
      - ./signoz/zz-clickhouse-config.xml:/etc/clickhouse-server/config.d/zz-clickhouse-config.xml:ro
      - ./signoz/clickhouse-users.xml:/etc/clickhouse-server/users.d/users.xml:ro
    environment:
      - CLICKHOUSE_DB=signoz_traces
      - CLICKHOUSE_USER=default
      # Empty password: local observability stack only; do not expose 9000/8123
      # beyond the compose network.
      - CLICKHOUSE_PASSWORD=
    ulimits:
      # ClickHouse needs a high open-file limit for its MergeTree part files.
      nofile:
        soft: 262144
        hard: 262144
    networks:
      - conserver
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8123/ping"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

  # OTLP collector: receives traces/metrics/logs from the app containers and
  # writes them to ClickHouse.
  signoz-otel-collector:
    image: signoz/signoz-otel-collector:latest
    container_name: signoz-otel-collector
    hostname: signoz-otel-collector
    command:
      - "--config=/etc/otel-collector-config.yaml"
    depends_on:
      signoz-clickhouse:
        condition: service_healthy
    environment:
      - OTEL_RESOURCE_ATTRIBUTES=host.name=signoz-host,os.type=linux
    volumes:
      - ./signoz/otel-collector-config.yaml:/etc/otel-collector-config.yaml:ro
    ports:
      - "4317:4317"  # OTLP gRPC receiver
      - "4318:4318"  # OTLP HTTP receiver
    networks:
      - conserver
    healthcheck:
      # 13133 is the collector's health_check extension endpoint.
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:13133/"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

  # SignOz query service + web UI.
  signoz:
    image: signoz/query-service:latest
    container_name: signoz
    hostname: signoz
    depends_on:
      signoz-clickhouse:
        condition: service_healthy
    environment:
      - ClickHouseUrl=tcp://signoz-clickhouse:9000
      - SIGNOZ_LOCAL_DB_PATH=/var/lib/signoz/signoz.db
      - DASHBOARDS_PATH=/root/config/dashboards
      - STORAGE=clickhouse
      - GODEBUG=netdns=go
      # Must stay false: with telemetry enabled the query service's telemetry
      # cron checks TTL on signoz_logs.logs, which does not exist after the
      # schema migration used here, and the service panics (nil pointer).
      # The README documents this fix as already applied in this repo.
      - TELEMETRY_ENABLED=false
      - DEPLOYMENT_TYPE=docker-standalone
    volumes:
      - signoz_data:/var/lib/signoz
      - ./signoz/dashboards:/root/config/dashboards
    ports:
      - "3301:8080"  # Web UI
    networks:
      - conserver
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/api/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped

  # Override conserver and api (defined in the base compose file) to run under
  # opentelemetry-instrument and export OTLP over HTTP to the collector.
  conserver:
    command: "opentelemetry-instrument python ./server/main.py"
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: http://signoz-otel-collector:4318
      OTEL_EXPORTER_OTLP_PROTOCOL: http/protobuf
      OTEL_TRACES_EXPORTER: otlp
      OTEL_METRICS_EXPORTER: otlp
      OTEL_LOGS_EXPORTER: otlp
      OTEL_SERVICE_NAME: conserver
    depends_on:
      signoz-otel-collector:
        condition: service_healthy

  api:
    command: /bin/bash -c "opentelemetry-instrument uvicorn server.api:app --host 0.0.0.0 --port 8000"
    environment:
      OTEL_EXPORTER_OTLP_ENDPOINT: http://signoz-otel-collector:4318
      OTEL_EXPORTER_OTLP_PROTOCOL: http/protobuf
      OTEL_TRACES_EXPORTER: otlp
      OTEL_METRICS_EXPORTER: otlp
      OTEL_LOGS_EXPORTER: otlp
      OTEL_SERVICE_NAME: conserver.api
    depends_on:
      signoz-otel-collector:
        condition: service_healthy

  # Ships container stdout/stderr logs to SignOz via the docker socket.
  # 172.17.0.1 is the default docker bridge gateway (host side).
  logspout-signoz:
    image: pavanputhra/logspout-signoz
    container_name: logspout-signoz
    restart: unless-stopped
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      SIGNOZ_LOG_ENDPOINT: http://172.17.0.1:8082
      ENV: prod
    command: signoz://172.17.0.1:8082
3 changes: 3 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ ENV VCON_SERVER_VERSION=${VCON_SERVER_VERSION}
ENV VCON_SERVER_GIT_COMMIT=${VCON_SERVER_GIT_COMMIT}
ENV VCON_SERVER_BUILD_TIME=${VCON_SERVER_BUILD_TIME}

# Configure apt to use HTTPS sources (required when HTTP port 80 is blocked)
RUN sed -i 's|http://deb.debian.org|https://deb.debian.org|g' /etc/apt/sources.list.d/debian.sources

# Install ffmpeg/libav in a single layer and drop the apt package indexes
# afterwards so stale lists are not baked into the image.
RUN apt-get update && \
    apt-get install -y libavdevice-dev ffmpeg && \
    rm -rf /var/lib/apt/lists/*

Expand Down
216 changes: 216 additions & 0 deletions docs/PERFORMANCE_TESTING.md
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Details are very specific to one env. We should remove it from open source repo.

Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
# Performance Testing Notes

**Last Updated:** 2026-02-02

## Test Environment

### Servers
- **Conserver (vcon-server)**: http://localhost:8080 (token: set via environment variable — do not commit real tokens to this public repo)
- **vfun (transcription)**: http://localhost:4380/wtf

### NAS Storage

**Mount Point:** `/mnt/nas`
```
<nas-host>:/mnt/slave_recording → /mnt/nas (NFS4)
- rsize/wsize: 1MB
- Protocol: TCP
- Hard mount with 600s timeout
```

**Directory Structure:**
```
/mnt/nas/
├── Freeswitch1/ # 20 Freeswitch servers (1-20)
│ ├── 2026-01-19/ # Date directories (15+ days available)
│ │ ├── 06/ # Hour directories (00-23)
│ │ │ └── *.wav # Recording files (~489k per day)
│ │ ├── 07/
│ │ └── ...
│ ├── 2026-01-20/
│ └── ...
├── Freeswitch2/
├── ...
├── Freeswitch20/
├── Batch1_recording/
├── pcaps_*/ # Packet captures
└── fs_collect_by_number.sh # Collection utility
```

**File Naming Pattern:**
```
{campaign}_{caller}_{callid}_{date}_{time}.wav
Example: 10508_12026661845_993317168030975_2026-01-19_06:47:08.wav

Fields:
- campaign: Campaign/extension ID (e.g., 10508, 6075, 9676)
- caller: Phone number (e.g., 12026661845)
- callid: Unique call ID (e.g., 993317168030975)
- date: YYYY-MM-DD
- time: HH:MM:SS
```

**Scale:**
- ~489,000 recordings per day per Freeswitch server
- ~9.78 million recordings/day across all 20 servers
- ~938 KB average file size (~60 seconds @ 8kHz 16-bit)
- ~9 TB/day of new recordings
- 15+ days of historical data
- Access requires `nasgroup` membership

---

## Performance Results (2026-02-02)

### Conserver API
| Metric | Value |
|--------|-------|
| Throughput | 151.68 req/s |
| Avg Latency | 57.22 ms |
| Success Rate | 100% |

### vfun Transcription (Local Files)
| Metric | Value |
|--------|-------|
| Throughput | 32.72 files/sec |
| Data Rate | 30.36 MB/sec |
| Peak GPU Utilization | 95% |

### vfun Transcription (NAS Files)
| Files | Workers | Throughput | Data Rate | Parallelism |
|-------|---------|------------|-----------|-------------|
| 100 | 32 | 48.40 files/sec | 34.08 MB/s | 25.9x |
| 200 | 64 | 45.60 files/sec | 30.92 MB/s | 47.9x |
| 500 | 64 | 43.63 files/sec | 30.85 MB/s | 59.4x |

### Full Pipeline (NAS → vfun → vCon → Conserver → vcon-mcp)
| Files | Workers | Throughput | vCons Stored | Success |
|-------|---------|------------|--------------|---------|
| 50 | 16 | 2,447 files/min | 35 | 100% |
| 500 | 48 | 2,576 files/min | 362 | 100% |
| 1,000 | 64 | **2,973 files/min** | 703 | 100% |

**Full Pipeline Capacity (single vfun instance):**
- ~3,000 files/min = **~4.3 million files/day**
- vCon creation adds minimal overhead (~1ms per vCon)
- Conserver chain processing: ~10ms per vCon
- Webhook to vcon-mcp (Supabase): ~100-200ms per vCon

**Key Findings:**
- NAS network storage does not bottleneck transcription
- GPU batching works efficiently (59.4x parallelism vs 64x max)
- Sustained ~44-48 files/sec with high concurrency
- 100% success rate across 1,500+ files
- Full pipeline maintains ~48 files/sec throughput

### vfun Batching Configuration
```
GPU_MAX_BATCH_SIZE = 64
GPU_COALESCE_TIMEOUT_US = 5000 (5ms)
GPU_COALESCE_MIN_FILL = 16
```

---

## Test Scripts

Located in `scripts/`:
- `nas_transcription_pipeline.py` - **Production pipeline** with vCon creation and storage
- `nas_stress_test.py` - High-concurrency vfun stress test with NAS files

### Running Tests

```bash
# Check servers
curl -s http://localhost:8080/docs | head -5
curl -s http://localhost:4380/ready

# Start vfun if needed
cd ~/strolid/vfun && ./vfun --port 4380

# Run vfun-only stress test
python3 scripts/nas_stress_test.py 200 64

# Run full pipeline (transcription + vCon storage)
python3 scripts/nas_transcription_pipeline.py --date 2026-01-19 --hour 06 --limit 500 --workers 48 --store-vcons

# Dry run to see file counts
python3 scripts/nas_transcription_pipeline.py --date 2026-01-19 --dry-run
```

### Pipeline Chain Configuration
```
main_chain: ingress:default → tag → expire_vcon → egress:processed → storage: supabase_webhook
transcription: ingress:transcribe → tag → wtf_transcribe → keyword_tagger → expire_vcon → egress:transcribed → storage: supabase_webhook
```
Note: `supabase_webhook` runs as a post-chain storage (parallel, non-blocking) via `storage.webhook` module.

---

## vfun Stability Issues (CUDA Crashes)

### Root Cause Analysis (2026-02-02)

**Problem:** vfun crashes intermittently after processing hundreds of files under sustained load.

**Investigation findings:**
1. **NOT the NAS** - Files read correctly, NAS performance is stable
2. **NOT memory leaks** - GPU memory stable at ~12.6GB throughout processing
3. **NOT single file issues** - Crash-causing files process fine individually
4. **IS a CUDA batching issue** - Specific batch combinations trigger cuBLAS failures

**Error signature:**
```
RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling cublasLtMatmul
with transpose_mat1 1 transpose_mat2 0 m 1024 n 251 k 1024
```

**What happens:**
1. Under high concurrency, vfun batches audio files for GPU processing
2. Certain combinations of audio lengths create tensor dimensions that trigger cuBLAS matrix multiplication failures
3. The CUDA error corrupts GPU state, leaving vfun hung (process exists but unresponsive to `/ready` endpoint)
4. GPU memory shows 0 MiB used after crash (resources released but process not terminated)

**Affected dimensions:** The `n=251` parameter in the error suggests certain audio sequence lengths cause problematic matrix sizes during the transformer decoder forward pass.

### Workarounds for Production

**1. Auto-restart script:**
```bash
#!/bin/bash
# Run pipeline with automatic vfun restart on crash
restart_vfun() {
pkill -9 -f "vfun --port 4380"
sleep 2
cd ~/strolid/vfun && ./vfun --port 4380 > /tmp/vfun.log 2>&1 &
sleep 10
}

# Check health every 30 seconds, restart if hung
while true; do
if ! curl -s --max-time 5 http://localhost:4380/ready > /dev/null 2>&1; then
echo "$(date) - vfun crash detected, restarting..."
restart_vfun
fi
sleep 30
done
```

**2. Reduce concurrency** (may reduce throughput but fewer crashes):
- Try 32-48 workers instead of 64
- Smaller batches reduce likelihood of problematic tensor dimensions

**3. Batch processing with checkpoints:**
- Process in batches of 2000-3000 files
- Restart vfun between batches preventively
- Track progress in checkpoint files

### Investigation Scripts

Located in `scripts/`:
- `find_bad_file.py` - Tests files sequentially to identify crash point
- `run_pipeline_with_restart.sh` - Pipeline with auto-restart capability

### Logs to Check
- `/tmp/vfun.log` or `/tmp/vfun_test.log` - vfun stdout/stderr including CUDA errors
- Pipeline logs show last successful file before crash
Loading
Loading