Skip to content

Commit ea30478

Browse files
Merge pull request #79 from RSS-Engineering/sai/PLATEXP-11106
feat(connectivity_check): add automated monitoring with direct Datadog integration
2 parents a6ac8c5 + 06e9ff6 commit ea30478

13 files changed

Lines changed: 401 additions & 41 deletions

File tree

.github/workflows/build-lambda.yml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
name: Build Connectivity Check Lambda
2+
3+
on:
4+
push:
5+
paths:
6+
- 'modules/connectivity_check/lambda/**'
7+
- 'modules/connectivity_check/scripts/build-lambda.sh'
8+
workflow_dispatch:
9+
10+
jobs:
11+
build:
12+
runs-on: ubuntu-latest
13+
steps:
14+
- uses: actions/checkout@v4
15+
16+
- name: Setup Node.js
17+
uses: actions/setup-node@v4
18+
with:
19+
node-version: '22'
20+
registry-url: 'https://npm.pkg.github.com'
21+
scope: '@racker'
22+
23+
- name: Build Lambda package
24+
env:
25+
JANUS_GITHUB_PAT: ${{ secrets.JANUS_GITHUB_PAT }}
26+
run: |
27+
chmod +x modules/connectivity_check/scripts/build-lambda.sh
28+
modules/connectivity_check/scripts/build-lambda.sh
29+
30+
- name: Commit lambda.zip if changed
31+
run: |
32+
git config user.name "github-actions[bot]"
33+
git config user.email "github-actions[bot]@users.noreply.github.com"
34+
git add modules/connectivity_check/lambda.zip
35+
git diff --staged --quiet || git commit -m "chore(connectivity_check): rebuild lambda.zip [skip ci]"
36+
git push

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,9 @@ modules/lambda-layer-deps/examples/*/builds
88
modules/lambda-layer-deps/examples/*/node_modules
99
modules/lambda-layer-deps/examples/*/package.log
1010
.DS_Store
11+
12+
# Ignore compiled JavaScript in lambda source directories (TypeScript compiles to JS during build)
13+
modules/connectivity_check/lambda/index.js
14+
15+
# Allow pre-built Lambda packages
16+
!modules/connectivity_check/lambda.zip
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# Connectivity Check Module
2+
3+
Terraform module for deploying a Lambda function that tests TCP and HTTPS connectivity to specified endpoints and publishes metrics directly to Datadog.
4+
5+
## Features
6+
7+
- **ES Modules**: Lambda uses Node.js 22 with ES modules for modern JavaScript support
8+
- **Direct Datadog Integration**: Metrics sent directly via `@racker/janus-core` stats (no OpenTelemetry layer)
9+
- **Multi-region Support**: Designed to run in multiple regions with proper region tagging
10+
- **Parallel Execution**: Tests all endpoints concurrently for fast execution
11+
- **Flexible Protocols**: Supports TCP, HTTP, and HTTPS connectivity checks
12+
13+
## Lambda Package
14+
15+
The Lambda function uses a pre-built package (`lambda.zip`) that includes:
16+
- Compiled TypeScript (ES modules)
17+
- `@racker/janus-core` dependency for Datadog metrics
18+
- All required npm dependencies
19+
20+
This approach ensures the module works across all consuming repositories without requiring npm authentication during `terraform apply`.
21+
22+
### Rebuilding the Lambda Package
23+
24+
If you modify the Lambda code or dependencies:
25+
26+
```bash
27+
cd modules/connectivity_check
28+
./scripts/build-lambda.sh
29+
```
30+
31+
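The GitHub Actions workflow automatically rebuilds the package and commits the updated `lambda.zip` when changes are pushed under `modules/connectivity_check/lambda/` or to `modules/connectivity_check/scripts/build-lambda.sh`. It can also be run manually via `workflow_dispatch`.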
32+
33+
## Usage
34+
35+
```hcl
36+
module "connectivity_check" {
37+
source = "git@github.com:RSS-Engineering/terraform//modules/connectivity_check?ref=<commit-sha>"
38+
39+
function_name = "connectivity-check-primary"
40+
subnet_ids = ["subnet-xxx", "subnet-yyy"]
41+
security_group_ids = ["sg-xxx"]
42+
43+
enable_monitoring = true
44+
monitoring_schedule = "rate(1 minute)"
45+
monitoring_targets = [
46+
{
47+
host = "example.com"
48+
port = 443
49+
protocol = "https"
50+
critical = true
51+
}
52+
]
53+
54+
datadog_api_key = var.datadog_api_key
55+
environment = var.environment
56+
metric_tags = "service:janus,team:platform,region:us-west-2"
57+
}
58+
```
59+
60+
## Requirements
61+
62+
- Terraform >= 1.0
63+
- AWS Provider >= 5.0
64+
65+
## Inputs
66+
67+
| Name | Description | Type | Default | Required |
68+
|------|-------------|------|---------|----------|
69+
| function_name | Name of the Lambda function | string | - | yes |
70+
| subnet_ids | List of subnet IDs for Lambda | list(string) | - | yes |
71+
| security_group_ids | List of security group IDs | list(string) | - | yes |
72+
| enable_monitoring | Enable scheduled monitoring | bool | false | no |
73+
| monitoring_schedule | EventBridge schedule expression | string | "rate(1 minute)" | no |
74+
| monitoring_targets | List of endpoints to monitor | list(object) | [] | no |
75+
| datadog_api_key | Datadog API key | string | "" | no |
76+
| environment | Environment name | string | "unknown" | no |
77+
| metric_tags | Additional tags for Datadog metrics, as a comma-separated list of `key:value` pairs (e.g. `team:platform,region:us-west-2`) | string | "" | no |
78+
| timeout | Lambda timeout in seconds | number | 60 | no |
79+
| memory_size | Lambda memory in MB | number | 128 | no |
80+
| log_retention_days | CloudWatch log retention | number | 30 | no |
81+
82+
## Outputs
83+
84+
| Name | Description |
85+
|------|-------------|
86+
| lambda_function_arn | ARN of the Lambda function |
87+
| lambda_function_name | Name of the Lambda function |
88+
| lambda_role_arn | ARN of the Lambda IAM role |
23.6 MB
Binary file not shown.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
@racker:registry=https://npm.pkg.github.com
2+
//npm.pkg.github.com/:_authToken=${JANUS_GITHUB_PAT}

modules/connectivity_check/lambda/index.js

Lines changed: 0 additions & 1 deletion
This file was deleted.

modules/connectivity_check/lambda/handler.ts renamed to modules/connectivity_check/lambda/index.ts

Lines changed: 108 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
1-
import { Socket } from 'node:net';
2-
import { lookup } from 'node:dns/promises';
1+
import { Socket } from 'net';
2+
import { lookup } from 'dns/promises';
3+
// @ts-ignore - janus-core doesn't have proper ES module type declarations
4+
import stats from '@racker/janus-core/lib/stats/index.js';
5+
// @ts-ignore - janus-core doesn't have proper ES module type declarations
6+
import log from '@racker/janus-core/lib/log.js';
37

48
interface TestTarget {
59
host: string;
610
port: number;
711
protocol: 'tcp' | 'http' | 'https';
812
path?: string;
13+
critical?: boolean;
914
}
1015

1116
interface TestResult {
@@ -18,38 +23,75 @@ interface TestResult {
1823
error?: string;
1924
errorCode?: string;
2025
httpStatus?: number;
26+
critical?: boolean;
2127
}
2228

2329
interface LambdaEvent {
2430
targets: TestTarget[];
2531
}
2632

2733
export const handler = async (event: LambdaEvent): Promise<TestResult[]> => {
28-
console.log(
29-
'Testing connectivity for targets:',
30-
JSON.stringify(event.targets)
31-
);
32-
33-
const results: TestResult[] = [];
34-
35-
for (const target of event.targets) {
36-
if (target.protocol === 'tcp') {
37-
results.push(await testTcp(target));
38-
} else if (target.protocol === 'http' || target.protocol === 'https') {
39-
results.push(await testHttp(target));
40-
} else {
41-
results.push({
42-
host: target.host,
43-
port: target.port,
44-
protocol: target.protocol,
45-
success: false,
46-
error: `Unsupported protocol: ${target.protocol}`,
47-
});
48-
}
49-
}
34+
const env = process.env.ENVIRONMENT || 'unknown';
35+
36+
log.initialize('connectivity-check', {
37+
level: env === 'local' ? 'debug' : 'info'
38+
});
39+
40+
try {
41+
// Parse additional metric tags from environment variable
42+
const additionalTags = process.env.METRIC_TAGS
43+
? process.env.METRIC_TAGS.split(',').map(tag => tag.trim())
44+
: [];
45+
46+
// Initialize Datadog stats
47+
await stats.initializeWithDriver('http', 'connectivity.', {
48+
defaultTags: [
49+
`env:${env}`,
50+
'service:connectivity-check',
51+
...additionalTags
52+
],
53+
mock: ['local', 'test'].includes(env)
54+
});
55+
56+
log.info(
57+
{ targetCount: event.targets.length },
58+
'Testing connectivity for targets'
59+
);
60+
61+
// Run all connectivity checks in parallel for faster execution
62+
const checkPromises = event.targets.map(async (target) => {
63+
let result: TestResult;
64+
65+
if (target.protocol === 'tcp') {
66+
result = await testTcp(target);
67+
} else if (target.protocol === 'http' || target.protocol === 'https') {
68+
result = await testHttp(target);
69+
} else {
70+
result = {
71+
host: target.host,
72+
port: target.port,
73+
protocol: target.protocol,
74+
success: false,
75+
error: `Unsupported protocol: ${target.protocol}`,
76+
critical: target.critical,
77+
};
78+
}
79+
80+
return result;
81+
});
82+
83+
// Wait for all checks to complete
84+
const results = await Promise.all(checkPromises);
85+
86+
// Publish metrics to Datadog for all results
87+
results.forEach(result => publishMetrics(result));
88+
89+
log.info({ successCount: results.filter(r => r.success).length, totalCount: results.length }, 'Connectivity check complete');
5090

51-
console.log('Results:', JSON.stringify(results));
52-
return results;
91+
return results;
92+
} finally {
93+
await stats.close();
94+
}
5395
};
5496

5597
function isIpAddress(host: string): boolean {
@@ -101,6 +143,7 @@ async function testTcp(target: TestTarget): Promise<TestResult> {
101143
success: false,
102144
error: `DNS resolution failed: ${dnsResult.error}`,
103145
errorCode: dnsResult.errorCode,
146+
critical: target.critical,
104147
};
105148
}
106149

@@ -119,6 +162,7 @@ async function testTcp(target: TestTarget): Promise<TestResult> {
119162
success: true,
120163
resolvedIp: dnsResult.ip,
121164
latencyMs: Date.now() - start,
165+
critical: target.critical,
122166
});
123167
});
124168

@@ -132,6 +176,7 @@ async function testTcp(target: TestTarget): Promise<TestResult> {
132176
resolvedIp: dnsResult.ip,
133177
error: 'Connection timeout (5s)',
134178
errorCode: 'ETIMEDOUT',
179+
critical: target.critical,
135180
});
136181
});
137182

@@ -145,6 +190,7 @@ async function testTcp(target: TestTarget): Promise<TestResult> {
145190
resolvedIp: dnsResult.ip,
146191
error: err.message,
147192
errorCode: err.code,
193+
critical: target.critical,
148194
});
149195
});
150196

@@ -165,6 +211,7 @@ async function testHttp(target: TestTarget): Promise<TestResult> {
165211
success: false,
166212
error: `DNS resolution failed: ${dnsResult.error}`,
167213
errorCode: dnsResult.errorCode,
214+
critical: target.critical,
168215
};
169216
}
170217

@@ -180,14 +227,14 @@ async function testHttp(target: TestTarget): Promise<TestResult> {
180227

181228
// Log response details
182229
const headers = Object.fromEntries(response.headers.entries());
183-
console.log(`Response headers for ${url}:`, JSON.stringify(headers));
230+
log.debug({ url, headers }, 'HTTP response headers');
184231

185232
const bodyText = await response.text();
186233
const truncatedBody =
187234
bodyText.length > 100
188235
? bodyText.substring(0, 100) + '...(truncated)'
189236
: bodyText;
190-
console.log(`Response body for ${url}:`, truncatedBody);
237+
log.debug({ url, body: truncatedBody }, 'HTTP response body');
191238

192239
return {
193240
host: target.host,
@@ -197,6 +244,7 @@ async function testHttp(target: TestTarget): Promise<TestResult> {
197244
resolvedIp: dnsResult.ip,
198245
latencyMs: Date.now() - start,
199246
httpStatus: response.status,
247+
critical: target.critical,
200248
};
201249
} catch (err: any) {
202250
return {
@@ -207,6 +255,38 @@ async function testHttp(target: TestTarget): Promise<TestResult> {
207255
resolvedIp: dnsResult.ip,
208256
error: err.message,
209257
errorCode: err.code || err.cause?.code,
258+
critical: target.critical,
210259
};
211260
}
212261
}
262+
263+
/**
 * Publish the metrics for a single connectivity check result to Datadog via
 * janus-core stats.
 *
 * Every metric is tagged with `endpoint` (host:port), `host`, `protocol` and
 * `critical` (reported as `false` when the result does not set it).
 *
 * Metrics emitted:
 * - `endpoint.status` (gauge): 1 when the check succeeded, 0 otherwise.
 * - `endpoint.latency` (timing): only when the result carries `latencyMs`.
 * - `endpoint.success.count` or `endpoint.error.count` (increment): exactly
 *   one increment per result. Failures that carry an `errorCode` get an
 *   additional `error_code` tag on that single increment.
 *
 * @param result - Outcome of one connectivity check.
 */
function publishMetrics(result: TestResult): void {
  const endpoint = `${result.host}:${result.port}`;
  const tags = [
    `endpoint:${endpoint}`,
    `host:${result.host}`,
    `protocol:${result.protocol}`,
    `critical:${result.critical || false}`,
  ];

  // Connectivity status metric (1 = success, 0 = failure)
  stats.gauge('endpoint.status', result.success ? 1 : 0, tags);

  // Latency is absent on some failure paths (e.g. DNS failure), so only
  // report it when the check actually measured one. 0 ms is a valid value.
  if (result.latencyMs !== undefined) {
    stats.timing('endpoint.latency', result.latencyMs, tags);
  }

  if (result.success) {
    stats.increment('endpoint.success.count', 1, tags);
    return;
  }

  // Emit a single increment per failure. Sending one increment with the base
  // tags and a second one with the extra error_code tag on the same metric
  // would count the failure twice whenever the metric is summed across tags.
  const errorTags = result.errorCode
    ? [...tags, `error_code:${result.errorCode}`]
    : tags;
  stats.increment('endpoint.error.count', 1, errorTags);
}

modules/connectivity_check/lambda/package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
"engines": {
66
"node": "~22"
77
},
8-
"dependencies": {},
8+
"dependencies": {
9+
"@racker/janus-core": "^12.4.0"
10+
},
911
"devDependencies": {
1012
"@types/node": "~22",
1113
"typescript": "~5"
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"compilerOptions": {
3+
"target": "es2024",
4+
"module": "nodenext",
5+
"lib": ["ES2024"],
6+
"outDir": ".",
7+
"rootDir": ".",
8+
"strict": false,
9+
"skipLibCheck": true,
10+
"resolveJsonModule": true,
11+
"moduleResolution": "nodenext",
12+
"esModuleInterop": true,
13+
"allowSyntheticDefaultImports": true,
14+
"forceConsistentCasingInFileNames": true
15+
},
16+
"include": ["index.ts"],
17+
"exclude": ["node_modules"]
18+
}

0 commit comments

Comments
 (0)