Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions graphify/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@
except Exception:
_EXTRACTOR_VERSION = "unknown"

# Local extractor-logic salt: the package version alone does not change when the
# extractor source is edited in place (forks/dev), so AST cache entries would be
# served stale. Bump this whenever extractor output changes (e.g. the GraphQL SDL
# @key/federation work, or the gql call-site extraction) to invalidate the AST
# cache namespace.
# The salt is appended to whatever value was resolved above, including the
# "unknown" fallback. NOTE(review): the suffix tokens look like
# <feature><revision> counters (sdlfed1, gqlcalls1) — confirm the bump convention.
_EXTRACTOR_VERSION = f"{_EXTRACTOR_VERSION}+sdlfed1+gqlcalls1"

# Version dirs already swept this process — cleanup runs once per (base, version).
_cleaned_ast_dirs: set[str] = set()

Expand Down
2 changes: 1 addition & 1 deletion graphify/detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class FileType(str, Enum):

_MANIFEST_PATH = str(out_path("manifest.json"))

# Suffix sets used to classify files by kind. CODE_EXTENSIONS keeps its original
# listing order; the GraphQL SDL suffixes ('.graphqls', '.graphql') are last.
CODE_EXTENSIONS = {
    '.py', '.ts', '.tsx', '.js', '.jsx', '.mjs', '.ejs', '.ets',
    '.go', '.rs', '.java', '.groovy', '.gradle',
    '.cpp', '.cc', '.cxx', '.c', '.h', '.hpp', '.cu', '.cuh',
    '.rb', '.swift', '.kt', '.kts', '.cs', '.scala', '.php',
    '.lua', '.luau', '.toc', '.zig', '.ps1', '.psm1', '.psd1',
    '.ex', '.exs', '.m', '.mm', '.jl',
    '.vue', '.svelte', '.astro', '.dart',
    '.v', '.sv', '.svh', '.sql', '.r',
    # Fortran suffixes are case-sensitive: both spellings are listed on purpose.
    '.f', '.F', '.f90', '.F90', '.f95', '.F95', '.f03', '.F03', '.f08', '.F08',
    '.pas', '.pp', '.dpr', '.dpk', '.lpr', '.inc', '.dfm', '.lfm', '.lpk',
    '.sh', '.bash', '.json', '.tf', '.tfvars', '.hcl',
    '.dm', '.dme', '.dmi', '.dmm', '.dmf',
    '.sln', '.slnx', '.csproj', '.fsproj', '.vbproj', '.razor', '.cshtml',
    '.cls', '.trigger',
    '.graphqls', '.graphql',
}
DOC_EXTENSIONS = {'.md', '.mdx', '.qmd', '.txt', '.rst', '.html', '.yaml', '.yml'}
PAPER_EXTENSIONS = {'.pdf'}
IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg'}
Expand Down
212 changes: 211 additions & 1 deletion graphify/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -7651,6 +7651,11 @@ def _disambiguate_colliding_node_ids(
for node in nodes:
if node.get("type") == "module":
continue
# GraphQL SDL nodes use name-keyed ids (gql_<name>) deliberately: the same
# type declared in a repo's private AND public schema is the SAME federated
# entity, so it must collapse to one node, not split per file (#dup-entity).
if str(node.get("type", "")).startswith("gql"):
continue
nid = node.get("id")
if isinstance(nid, str) and nid:
by_id.setdefault(nid, []).append(node)
Expand Down Expand Up @@ -12566,6 +12571,11 @@ def _body_of(block):
}


# Code suffixes that can embed GraphQL operation call sites in string literals
# (gql`...` tagged templates in TS/JS, graphql:"..." struct tags in Go).
# _get_extractor consults this set: when a file's suffix is listed here and a
# base extractor exists for it, that extractor is wrapped with
# _compose_with_gql_calls so the call sites land in the per-file result.
_GQL_CALL_SUFFIXES = {".go", ".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs", ".mts", ".cts"}


def _get_extractor(path: Path) -> Any | None:
"""Return the correct extractor function for a file, or None if unsupported."""
if path.name.endswith(".blade.php"):
Expand All @@ -12580,7 +12590,15 @@ def _get_extractor(path: Path) -> Any | None:
# (#1377). apm.yml would otherwise be a .yml document handled by the LLM.
if is_package_manifest_path(path):
return extract_package_manifest
return _DISPATCH.get(path.suffix)
if path.suffix in (".graphqls", ".graphql"):
from graphify.graphql_sdl import extract_graphql_sdl
return extract_graphql_sdl
base = _DISPATCH.get(path.suffix)
# TS/JS/Go can embed GraphQL operation call sites in string literals; fold
# their extraction into the per-file result so it caches like the AST nodes.
if base is not None and path.suffix in _GQL_CALL_SUFFIXES:
return _compose_with_gql_calls(base)
return base


def _extract_single_file(args: tuple) -> tuple[int, dict]:
Expand Down Expand Up @@ -12746,6 +12764,191 @@ def _extract_sequential(
_PARALLEL_THRESHOLD = 20


def _consolidate_gql_duplicates(all_nodes: list[dict]) -> int:
"""Collapse same-id GraphQL SDL nodes (a type declared in both the private and
public schema of one repo) into a single node. An ``@key`` entity marking and
its key fields win over a plain ``gql_type`` so the federation owner is never
lost to merge order. Mutates ``all_nodes`` in place; returns nodes removed.
"""
first: dict[str, dict] = {}
dropped = 0
keep: list[dict] = []
for n in all_nodes:
if not str(n.get("type", "")).startswith("gql"):
keep.append(n)
continue
nid = n.get("id")
prev = first.get(nid)
if prev is None:
first[nid] = n
keep.append(n)
continue
# merge into the already-kept node, preferring the entity marking
dropped += 1
if n.get("type") == "gql_entity":
if prev.get("type") != "gql_entity":
prev["type"] = "gql_entity"
if n.get("federation") == "entity" or prev.get("federation") is None:
prev["federation"] = n.get("federation", prev.get("federation"))
keys = sorted(set(prev.get("key_fields", []) or []) | set(n.get("key_fields", []) or []))
if keys:
prev["key_fields"] = keys
if dropped:
all_nodes[:] = keep
return dropped


def _anchor_gql_calls(base_nodes: list[dict], call_nodes: list[dict]) -> list[dict]:
"""Anchor each ``gql_call`` site to the nearest code symbol defined above it
in the same file (so the node isn't an island within its repo). Returns
``references`` edges from that enclosing symbol to the call node.
"""
anchors: list[tuple[int, str]] = []
for n in base_nodes:
if n.get("file_type") != "code" or str(n.get("type", "")).startswith("gql"):
continue
loc = str(n.get("source_location") or "")
if not loc.startswith("L"):
continue
try:
anchors.append((int(loc[1:].split("-")[0]), n["id"]))
except ValueError:
continue
anchors.sort()
edges: list[dict] = []
for cn in call_nodes:
try:
ln = int(str(cn.get("source_location"))[1:])
except (ValueError, TypeError):
continue
prev_id = None
for a_line, a_id in anchors:
if a_line <= ln:
prev_id = a_id
else:
break
if prev_id and prev_id != cn["id"]:
edges.append({
"source": prev_id,
"target": cn["id"],
"relation": "references",
"confidence": "EXTRACTED",
"confidence_score": 1.0,
"source_file": cn.get("source_file"),
"source_location": cn.get("source_location"),
"weight": 1.0,
})
return edges


def _compose_with_gql_calls(base_extractor):
    """Return ``base_extractor`` wrapped so GraphQL call sites ride along.

    The wrapper runs the base extractor first and hands back its result
    unchanged when it carries an ``"error"`` key or when the file embeds no
    GraphQL call site. Otherwise the ``gql_call`` nodes reported by
    ``extract_gql_calls`` are appended to the result's nodes, and the anchoring
    ``references`` edges from ``_anchor_gql_calls`` are appended to its edges.
    Because everything lands in the per-file result, it is cached and
    incremental like the AST nodes.
    """
    from graphify.graphql_calls import extract_gql_calls

    def _composed(path: Path) -> dict:
        extracted = base_extractor(path)
        if "error" in extracted:
            return extracted

        gql_nodes = extract_gql_calls(path).get("nodes", [])
        if not gql_nodes:
            # Nothing embedded in this file: the base result is the answer.
            return extracted

        code_nodes = extracted.get("nodes", [])
        anchor_edges = _anchor_gql_calls(code_nodes, gql_nodes)
        extracted["edges"] = extracted.get("edges", []) + anchor_edges
        extracted["nodes"] = code_nodes + gql_nodes
        return extracted

    return _composed


def _link_gql_calls_to_operations(all_nodes: list[dict], all_edges: list[dict]) -> int:
"""Link ``gql_call`` call sites to the ``gql_operation`` they invoke, by name,
*within one repo* (e.g. a service calling its own operation). Cross-repo
links are added by the global stitch. Name-based, so edges are INFERRED.
"""
ops_by_name: dict[str, str] = {}
for n in all_nodes:
if n.get("type") == "gql_operation":
ops_by_name.setdefault(str(n.get("label", "")), n["id"])
if not ops_by_name:
return 0
existing = {(e.get("source"), e.get("target")) for e in all_edges}
added = 0
for n in all_nodes:
if n.get("type") != "gql_call":
continue
tgt = ops_by_name.get(str(n.get("op_name", "")))
if tgt and tgt != n["id"] and (n["id"], tgt) not in existing:
existing.add((n["id"], tgt))
all_edges.append({
"source": n["id"],
"target": tgt,
"relation": "calls",
"confidence": "INFERRED",
"confidence_score": 0.8,
"source_file": n.get("source_file", ""),
"source_location": n.get("source_location"),
"weight": 1.0,
})
added += 1
return added


def _link_gql_operations_to_resolvers(all_nodes: list[dict], all_edges: list[dict]) -> int:
"""Connect GraphQL operations (from the SDL extractor) to the code functions
that implement them, so the contract layer isn't an island floating off the
AST. Matches a ``gql_operation`` node's name to a callable code node by
normalized identifier — e.g. operation ``createPilotDistro`` -> resolver
``.CreatePilotDistro()``. Name-based, so edges are INFERRED. Returns count.
"""
ops = [n for n in all_nodes if n.get("type") == "gql_operation"]
if not ops:
return 0

def _core(label: str) -> str:
s = str(label).strip()
if s.startswith("."):
s = s[1:]
if s.endswith("()"):
s = s[:-2]
return s.lower()

# Index callable (function/method) code nodes by normalized name. Restrict to
# labels ending in ')' so we target resolvers/functions, not types or fields.
callable_by_core: dict[str, str] = {}
for n in all_nodes:
if str(n.get("type", "")).startswith("gql"):
continue
if n.get("file_type") != "code":
continue
lbl = str(n.get("label", ""))
if not lbl.endswith(")"):
continue
callable_by_core.setdefault(_core(lbl), n["id"])

existing = {(e.get("source"), e.get("target")) for e in all_edges}
added = 0
for op in ops:
tgt = callable_by_core.get(str(op.get("label", "")).lower())
if tgt and tgt != op["id"] and (op["id"], tgt) not in existing:
existing.add((op["id"], tgt))
all_edges.append({
"source": op["id"],
"target": tgt,
"relation": "implemented_by",
"confidence": "INFERRED",
"confidence_score": 0.85,
"source_file": op.get("source_file", ""),
"source_location": op.get("source_location"),
"weight": 1.0,
})
added += 1
return added


def extract(
paths: list[Path],
cache_root: Path | None = None,
Expand Down Expand Up @@ -13118,6 +13321,13 @@ def _has_import_evidence(candidate_id: str) -> bool:
for n in all_nodes:
n["_origin"] = "ast"

# Collapse private/public duplicates of the same GraphQL type into one node
# (entity marking wins), then bridge the SDL contract to its implementing code
# by linking each operation to the resolver function that implements it.
_consolidate_gql_duplicates(all_nodes)
_link_gql_operations_to_resolvers(all_nodes, all_edges)
_link_gql_calls_to_operations(all_nodes, all_edges)

return {
"nodes": all_nodes,
"edges": all_edges,
Expand Down
94 changes: 94 additions & 0 deletions graphify/global_graph.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from __future__ import annotations
import collections
import json
import hashlib
import sys
Expand Down Expand Up @@ -74,6 +75,91 @@ def _file_hash(path: Path) -> str:
return h.hexdigest()[:16]


def _stitch_federation(G: nx.Graph) -> int:
    """Link Apollo Federation entities across repos in the global graph.

    ``gql_entity`` nodes carry ``federation='entity'`` (the service owning
    ``type X @key``) or ``federation='extends'`` (a service whose
    ``extend type X @key`` references it). The entity name is the first
    space-delimited token of the node label. Every ``extends`` node gets a
    ``federation_key`` edge to each same-named ``entity`` node whose ``repo``
    attribute differs from its own.

    Idempotent: all existing ``federation_key`` edges are removed before the
    links are recomputed, so it is safe to re-run after every ``global_add``.

    Returns:
        The number of ``add_edge`` calls made.
    """
    G.remove_edges_from([
        (u, v)
        for u, v, attrs in G.edges(data=True)
        if attrs.get("relation") == "federation_key"
    ])

    owners: dict[str, list[str]] = collections.defaultdict(list)
    extenders: dict[str, list[str]] = collections.defaultdict(list)
    for node_id, attrs in G.nodes(data=True):
        if attrs.get("type") != "gql_entity":
            continue
        entity_name = str(attrs.get("label", "")).split(" ", 1)[0]
        if not entity_name:
            continue
        role = attrs.get("federation")
        if role == "entity":
            owners[entity_name].append(node_id)
        elif role == "extends":
            extenders[entity_name].append(node_id)

    added = 0
    for entity_name, extender_ids in extenders.items():
        owner_ids = owners.get(entity_name, [])
        for extender in extender_ids:
            extender_repo = G.nodes[extender].get("repo")
            for owner in owner_ids:
                # Only cross-repo pairs are stitched.
                if G.nodes[owner].get("repo") == extender_repo:
                    continue
                G.add_edge(
                    extender,
                    owner,
                    relation="federation_key",
                    confidence="EXTRACTED",
                    confidence_score=1.0,
                    source_file="<federation:@key>",
                    weight=1.0,
                )
                added += 1
    return added


def _stitch_gql_calls(G: nx.Graph) -> int:
    """Link GraphQL operation *call sites* to the operations they invoke, across
    repos, in the global graph.

    The call-site extractor tags each ``gql`...` `` / ``graphql:"..."`` usage as a
    ``gql_call`` node carrying ``op_name``; the SDL extractor owns the matching
    ``gql_operation`` in whatever service defines the schema. A frontend calling
    a backend mutation is the common cross-repo case, so each ``gql_call`` gets a
    ``calls`` edge to every operation node of the same name. The operation name
    is the first space-delimited token of its label; a call's name is its
    ``op_name``, falling back to its label.

    Idempotent: edges previously created by this stitch are dropped first, so
    it is safe to re-run after every ``global_add``. Only edges carrying this
    function's own marker (``relation == "calls"`` and
    ``source_file == "<gql:call-site>"``) are removed — ``calls`` edges that
    came from elsewhere (e.g. the per-repo call-site links merged in from a
    project graph) are left alone rather than wiped on every add.

    Note: ``add_edge`` on a pair that is already connected updates that edge's
    attributes with the values below (networkx behaviour for existing edges).

    Returns:
        The number of ``add_edge`` calls made.
    """
    marker = "<gql:call-site>"
    stale = [
        (u, v)
        for u, v, d in G.edges(data=True)
        if d.get("relation") == "calls" and d.get("source_file") == marker
    ]
    G.remove_edges_from(stale)

    ops_by_name: dict[str, list[str]] = collections.defaultdict(list)
    calls_by_name: dict[str, list[str]] = collections.defaultdict(list)
    for nid, d in G.nodes(data=True):
        t = d.get("type")
        if t == "gql_operation":
            name = str(d.get("label", "")).split(" ", 1)[0]
            if name:
                ops_by_name[name].append(nid)
        elif t == "gql_call":
            name = str(d.get("op_name") or d.get("label", ""))
            if name:
                calls_by_name[name].append(nid)

    added = 0
    for name, call_ids in calls_by_name.items():
        targets = ops_by_name.get(name)
        if not targets:
            continue
        for call in call_ids:
            for op in targets:
                if call == op:
                    continue
                G.add_edge(call, op, relation="calls", confidence="INFERRED",
                           confidence_score=0.8, source_file=marker, weight=1.0)
                added += 1
    return added


def global_add(source_path: Path, repo_tag: str) -> dict:
"""Add or update a project graph in the global graph.

Expand Down Expand Up @@ -142,6 +228,14 @@ def global_add(source_path: Path, repo_tag: str) -> dict:
G.add_edge(u, v, **data)

added = prefixed.number_of_nodes() - len(remap)

# Re-stitch cross-repo federation @key links now that this repo's entities
# are present (idempotent — recomputed over the whole graph each add).
_stitch_federation(G)
# Link GraphQL call sites to the operations they invoke across repos
# (frontend -> backend mutation); idempotent, same rationale.
_stitch_gql_calls(G)

_save_global_graph(G)

manifest["repos"][repo_tag] = {
Expand Down
Loading