riksdagsmonitor/scripts/render-lib/article-head-metadata.ts at main · Hack23/riksdagsmonitor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
/**
 * @module Infrastructure/RenderLib/ArticleHeadMetadata
 * @category Intelligence Operations / Supporting Infrastructure
 * @name Article `<head>` metadata composer (shared between renderer & QA tools)
 *
 * @description
 * Deterministic helper (side-effect free aside from a one-time memoised
 * registry load) that, given an aggregated `article.md` (with
 * front-matter + body) plus a target language and canonical path,
 * produces the **exact** set of `<head>` metadata values that
 * {@link ./article.ts | renderArticleHtml} embeds into a rendered news
 * page — title, branded title, description, keywords, article-type
 * label and parsed publication date.
 *
 * This module exists so the regenerate / test pipelines and the
 * Markdown→HTML article renderer share a single source of truth for
 * "what ships in `<head>` for a given `article.md`". Tests and the
 * `test-article-headers` CLI can call this function and be sure they
 * are observing exactly what the shipped corpus sees — no drift, no
 * forked SEO logic.
 *
 * Note: `loadArticleTypesRegistry()` reads `analysis/article-types.json`
 * on first call and caches the result — the module is therefore not
 * purely functional in the strict sense, but all subsequent calls are
 * fully deterministic for the same inputs.
 *
 * @author Hack23 AB (Infrastructure Team)
 * @license Apache-2.0
 */

import matter from 'gray-matter';

import type { Language } from '../types/language.js';
import { articleTypeLabel } from './article-type-i18n.js';
import { getBySubfolder, getById, loadArticleTypesRegistry } from './article-types.js';
import type { ArticleSeoMetadata } from './article-seo.js';
import { buildArticleSeoMetadata } from './article-seo.js';
import { brandTitle, DEFAULT_ARTICLE_SECTION } from './chrome/head.js';

/**
 * Legacy label table, consulted only when an article type has no entry
 * in the article-types registry. Do not grow this table for new types —
 * add a registry entry instead.
 *
 * Keys are normalized (hyphenated, lower-case) type slugs; values are
 * the English labels. The renderer resolves article-type labels through
 * this module (see {@link computeArticleHeadMetadata}), so this is the
 * one place legacy labels live.
 */
const ARTICLE_TYPE_LABELS_FALLBACK: Record<string, string> = {
  'deep-inspection': 'Deep inspection',
  'realtime': 'Realtime pulse',
  'realtime-pulse': 'Realtime pulse',
  'breaking': 'Breaking intelligence',
  'parliament-agenda': 'Parliament agenda',
};

/**
 * Resolve the English label for an article-type slug.
 *
 * Lookup order: registry by id, registry by subfolder, the legacy
 * fallback table, and finally the generic `'Political intelligence'`.
 *
 * @param type Article-type slug to look up.
 * @returns    The label from the first source that knows the slug.
 */
function getArticleTypeLabel(type: string): string {
  const registryEntry = getById(type) ?? getBySubfolder(type);
  return registryEntry
    ? registryEntry.label
    : (ARTICLE_TYPE_LABELS_FALLBACK[type] ?? 'Political intelligence');
}

/**
 * Turn an arbitrary article-type name into a lower-case, hyphen-separated
 * slug (e.g. `committeeReports` → `committee-reports`).
 *
 * @param value Raw type name (camelCase, spaced, punctuated, …).
 * @returns     The slug; may be empty when `value` has no alphanumerics.
 */
function normalizeArticleType(value: string): string {
  // Explicit alias first, then generic camelCase boundary splitting.
  const aliasExpanded = value.replace(/committeeReports/g, 'committee-reports');
  const camelSplit = aliasExpanded.replace(/([a-z])([A-Z])/g, '$1-$2');
  // Collapse every run of non-alphanumerics into a single hyphen …
  const hyphenated = camelSplit.replace(/[^a-zA-Z0-9]+/g, '-');
  // … and drop hyphens left dangling at either end.
  const trimmed = hyphenated.replace(/^-+|-+$/g, '');
  return trimmed.toLowerCase();
}

/**
 * Classify an article (`propositions`, `motions`, `committee-reports`, …)
 * by searching its canonical path and title for a known type name.
 *
 * Matching is a case-insensitive substring test against
 * `"<canonicalPath> <title>"`. Registry entries are tried first, in
 * registry order, matching on either `subfolder` or `id`. If none hit, a
 * short list of legacy names that pre-date the registry is tried. If
 * that also misses, the type defaults to `political-intelligence`.
 *
 * Exported so the head-metadata helper, the regenerator and test scripts
 * all classify articles the same way instead of re-creating the
 * matching rules locally.
 *
 * @param canonicalPath Canonical path of the rendered article.
 * @param title         Article title.
 * @returns             The normalized type slug and its (English) label.
 */
export function inferArticleType(canonicalPath: string, title: string): { type: string; label: string } {
  const haystack = `${canonicalPath} ${title}`.toLowerCase();
  const mentions = (needle: string): boolean => haystack.includes(needle.toLowerCase());

  // 1. Registry: first entry whose subfolder or id appears in the haystack.
  for (const candidate of loadArticleTypesRegistry().types) {
    if (mentions(candidate.subfolder) || mentions(candidate.id)) {
      return { type: normalizeArticleType(candidate.id), label: candidate.label };
    }
  }

  // 2. Legacy names. Order matters: `realtime-pulse` must be tested
  //    before its prefix `realtime`.
  const legacyHit = [
    'committeeReports',
    'deep-inspection',
    'realtime-pulse',
    'realtime',
    'breaking',
    'parliament-agenda',
  ].find((legacyName) => mentions(legacyName));

  // 3. Resolve the label from the *normalized* slug, never the raw hit:
  //    a camelCase name such as `committeeReports` has to land on the
  //    same registry/fallback entry as `committee-reports`, otherwise
  //    it would fall through to the generic default label.
  const type = normalizeArticleType(legacyHit ?? 'political-intelligence');
  const label = getArticleTypeLabel(type);
  return { type, label };
}

/**
 * Coerce `date` front-matter into a `YYYY-MM-DD` string.
 *
 * Accepted inputs:
 *  - a valid `Date` (gray-matter parses unquoted ISO dates eagerly) —
 *    formatted via its UTC calendar date;
 *  - a string that starts with a **real** calendar date in
 *    `YYYY-MM-DD` form (anything after the first 10 characters, such as
 *    a time or offset, is ignored).
 *
 * Everything else — missing values, invalid `Date`s, non-date strings,
 * and date-shaped strings that do not name an actual day (for example
 * `2026-13-45` or `2026-02-30`) — falls back to "today", so a malformed
 * value is never emitted as a publication date.
 *
 * @param dateRaw The `date:` field as returned by gray-matter.
 * @param now     Injection seam for "today" — defaults to `new Date()`.
 *                Tests pass a frozen clock to make assertions deterministic.
 * @returns       A `YYYY-MM-DD` string.
 */
export function parseFrontMatterDate(dateRaw: unknown, now: Date = new Date()): string {
  if (dateRaw instanceof Date && !Number.isNaN(dateRaw.getTime())) {
    return dateRaw.toISOString().slice(0, 10);
  }
  if (typeof dateRaw === 'string' && /^\d{4}-\d{2}-\d{2}/.test(dateRaw)) {
    const candidate = dateRaw.slice(0, 10);
    // The regex only checks the shape. Round-trip through `Date` to make
    // sure the digits name a real day: out-of-range months yield an
    // invalid Date, and engines that roll over impossible days (e.g.
    // Feb 30 → Mar 2) produce a different ISO date than the input.
    const parsed = new Date(`${candidate}T00:00:00.000Z`);
    if (!Number.isNaN(parsed.getTime()) && parsed.toISOString().slice(0, 10) === candidate) {
      return candidate;
    }
  }
  return now.toISOString().slice(0, 10);
}

/**
 * Input for {@link computeArticleHeadMetadata}.
 *
 * Only `markdown`, `lang` and `canonicalPath` are required. The optional
 * fields are either test seams (`now`), parse-avoidance shortcuts
 * (`parsedData`), or brief-derived overrides that take precedence over
 * the corresponding front-matter values when they are non-empty.
 */
export interface ArticleHeadMetadataInput {
  /**
   * Aggregated markdown (front-matter + body) produced by aggregateAnalysis.
   * Only parsed (via `gray-matter`) when {@link parsedData} is not supplied.
   */
  readonly markdown: string;
  /** Target language code; used for label localization, SEO and title branding. */
  readonly lang: Language;
  /**
   * Canonical path (e.g. `news/2026-04-23-propositions-en.html`).
   * Together with the resolved title it is the text searched by
   * {@link inferArticleType}, and it is forwarded to the SEO composer.
   */
  readonly canonicalPath: string;
  /**
   * Optional clock seam used by {@link parseFrontMatterDate} when the
   * front-matter `date:` is missing or malformed. Defaults to `new Date()`.
   */
  readonly now?: Date;
  /**
   * Pre-parsed front-matter data (the `.data` record returned by
   * `gray-matter`). When provided, the internal `matter()` call is
   * skipped, avoiding a duplicate parse in callers (e.g.
   * `renderArticleHtml`) that have already parsed the markdown.
   */
  readonly parsedData?: Record<string, unknown>;
  /**
   * Brief-derived title — when set and non-empty after trimming, this
   * trumps the `fm.title` frontmatter line. Sourced directly from the
   * executive brief by `renderArticleHtml` (see {@link ./article.ts}).
   * Leaving this unset preserves the legacy frontmatter-only audit path
   * so the 278 pre-`2026-03-26` legacy `news/*-en.html` files (whose
   * `analysis/daily/<date>/` sources have been deleted) keep their
   * existing SEO without throwing.
   */
  readonly briefDerivedTitle?: string;
  /**
   * Brief-derived description — when set and non-empty after trimming,
   * this trumps the `fm.description` frontmatter line. Companion to
   * {@link briefDerivedTitle}.
   */
  readonly briefDerivedDescription?: string;
  /**
   * Brief-derived entity tokens (bill IDs, committee codes, party
   * codes, named entities) — when provided and non-empty, these seed
   * the SERP `keywords` string in `buildArticleSeoMetadata` *instead of*
   * tokens re-parsed from the (now-deprecated) `fm.keywords` line. When
   * omitted or empty, `fm.keywords` is split into tokens as a fallback.
   * The aggregator no longer writes `keywords:` into `article.md` since
   * 2026-05-24; the renderer mines entities directly from the brief
   * markdown.
   */
  readonly briefDerivedEntities?: readonly string[];
}

/**
 * Result of {@link computeArticleHeadMetadata}.
 *
 * Carries both the resolved inputs that fed the SEO composer
 * **and** the computed `<head>` strings that ship in the rendered HTML,
 * so consumers (renderer, QA reports) get a complete picture in one
 * call.
 */
export interface ArticleHeadMetadata {
  /**
   * Title handed to the SEO composer, after the cascade: the
   * brief-derived title if non-empty, else the trimmed front-matter
   * `title:`, else a localized, date-stamped fallback.
   */
  readonly rawTitle: string;
  /**
   * Description handed to the SEO composer, after the cascade: the
   * brief-derived description if non-empty, else the front-matter
   * `description:` coerced to a string, else a fixed default sentence.
   */
  readonly rawDescription: string;
  /**
   * The front-matter `keywords:` value when it is a string, otherwise
   * `undefined`. Brief-derived entities do not affect this field.
   */
  readonly rawKeywords: string | undefined;
  /** Normalised publication date (YYYY-MM-DD). */
  readonly date: string;
  /** Article-type ID slug (`propositions`, `committee-reports`, …). */
  readonly articleTypeId: string;
  /** Localized article-type label (`Propositions`, `Komitéindstillinger`, …). */
  readonly articleTypeLabel: string;
  /**
   * The `article:section` / `articleSection` value passed to chrome
   * and JSON-LD by the renderer. Exposed here so the audit CLI reports
   * exactly what ships in the rendered HTML without re-implementing
   * the same derivation. Currently always the shared
   * `DEFAULT_ARTICLE_SECTION` constant from `chrome/head.ts`.
   */
  readonly articleSection: string;
  /**
   * Computed SEO `<title>` / `<meta description>` / `<meta keywords>`
   * triple from {@link buildArticleSeoMetadata} — i.e. exactly what the
   * renderer hands to chrome.
   */
  readonly seo: ArticleSeoMetadata;
  /**
   * Branded `<title>` as emitted by `chrome/head.ts`:
   * `seo.title` unchanged when it already mentions "Riksdagsmonitor",
   * otherwise `${seo.title} — Riksdagsmonitor`.
   */
  readonly brandedTitle: string;
}

/**
 * Derive the canonical `<head>` metadata for one `article.md`, exactly
 * as the article renderer does. Callers:
 *
 *  - {@link ./article.ts | renderArticleHtml} during real article
 *    generation
 *  - {@link ../test-article-headers.ts | test-article-headers} CLI for
 *    auditing the shipped corpus
 *
 * Both MUST go through this function so the audit report cannot drift
 * from what is actually rendered into HTML.
 *
 * Resolution rules:
 *  - title: brief-derived title → front-matter `title:` → localized
 *    `political-intelligence` label stamped with the date;
 *  - description: brief-derived description → front-matter
 *    `description:` → a fixed default sentence;
 *  - keyword entities: brief-derived entities → tokens split from the
 *    front-matter `keywords:` string.
 *
 * @param input Markdown (or pre-parsed front-matter), language,
 *              canonical path and optional brief-derived overrides.
 * @returns     The resolved inputs plus the computed SEO values.
 */
export function computeArticleHeadMetadata(input: ArticleHeadMetadataInput): ArticleHeadMetadata {
  const { lang, canonicalPath } = input;
  // Reuse the caller's parsed front-matter when supplied; parse otherwise.
  const frontMatter = (input.parsedData ?? matter(input.markdown).data) as Record<string, unknown>;

  // Brief-derived values win over front-matter when non-empty (the
  // executive brief is the source of truth for title/description since
  // the 2026-05-24 cascade). The front-matter-only path stays in place
  // for legacy `news/*-en.html` files whose analysis sources are gone.
  const titleFromBrief = input.briefDerivedTitle?.trim();
  const descriptionFromBrief = input.briefDerivedDescription?.trim();
  const titleFromFrontMatter = typeof frontMatter.title === 'string' ? frontMatter.title.trim() : '';

  const date = parseFrontMatterDate(frontMatter.date, input.now);
  // Last-resort title: localized type label plus the date, so non-English
  // pages never ship the bare English literal as their title.
  const lastResortTitle = `${articleTypeLabel('political-intelligence', lang, 'Political Intelligence')} — ${date}`;

  let rawTitle: string;
  if (titleFromBrief) {
    rawTitle = titleFromBrief;
  } else if (titleFromFrontMatter !== '') {
    rawTitle = titleFromFrontMatter;
  } else {
    rawTitle = lastResortTitle;
  }

  let rawDescription: string;
  if (descriptionFromBrief) {
    rawDescription = descriptionFromBrief;
  } else {
    rawDescription = String(frontMatter.description ?? 'Riksdagsmonitor political intelligence report.');
  }

  const rawKeywords = typeof frontMatter.keywords === 'string' ? frontMatter.keywords : undefined;

  const inferred = inferArticleType(canonicalPath, rawTitle);
  const localizedLabel = articleTypeLabel(inferred.type, lang, inferred.label);

  // Entities that seed the SERP keywords: explicit brief-derived list
  // when the renderer passed one, else tokens from legacy `keywords:`.
  const suppliedEntities = input.briefDerivedEntities;
  const briefEntities =
    suppliedEntities !== undefined && suppliedEntities.length > 0
      ? suppliedEntities
      : parseKeywordsString(rawKeywords);

  const seo = buildArticleSeoMetadata({
    title: rawTitle,
    description: rawDescription,
    keywords: rawKeywords,
    briefEntities,
    lang,
    date,
    articleTypeLabel: localizedLabel,
    articleTypeId: inferred.type,
    canonicalPath,
  });

  return {
    rawTitle,
    rawDescription,
    rawKeywords,
    date,
    articleTypeId: inferred.type,
    articleTypeLabel: localizedLabel,
    seo,
    brandedTitle: brandTitle(seo.title, lang),
    // Same shared constant that chrome/head.ts uses, so the section the
    // audit reports and the one the renderer ships cannot diverge.
    articleSection: DEFAULT_ARTICLE_SECTION,
  };
}

/**
 * Tokenize a `keywords:` front-matter value.
 *
 * Splits on the ASCII comma as well as the full-width `，` and
 * ideographic `、` separators (some translated keyword strings use
 * native punctuation), trims each piece and discards blanks.
 *
 * @param raw The keywords string, or `undefined` when absent.
 * @returns   Trimmed, non-empty tokens in source order; `[]` for
 *            `undefined` / empty input.
 */
function parseKeywordsString(raw: string | undefined): readonly string[] {
  const tokens: string[] = [];
  if (raw == null || raw.length === 0) return tokens;
  for (const piece of raw.split(/[,，、]/)) {
    const token = piece.trim();
    if (token.length > 0) tokens.push(token);
  }
  return tokens;
}