diff --git a/.cursor/rules/typescript-coding-guidelines.mdc b/.cursor/rules/typescript-coding-guidelines.mdc index 44d73e74271..1230b605b1d 100644 --- a/.cursor/rules/typescript-coding-guidelines.mdc +++ b/.cursor/rules/typescript-coding-guidelines.mdc @@ -26,7 +26,7 @@ alwaysApply: false # Workspaces - The project uses yarn workspaces. -- If you want to install a dependency, you need to do it in the relevant workspace. e.g. `hash/apps/hash-frontend`. +- If you want to install a dependency, you need to do it in the relevant workspace. e.g. `apps/hash-frontend`. - The project # Frontend diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/answer-question-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/answer-question-action.ts index 092d170fed2..a59e6f14290 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/answer-question-action.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/answer-question-action.ts @@ -1,5 +1,9 @@ import { extractEntityUuidFromEntityId } from "@blockprotocol/type-system"; import type { AiFlowActionActivity } from "@local/hash-backend-utils/flows"; +import { + getStorageProvider, + resolvePayloadValue, +} from "@local/hash-backend-utils/flows/payload-storage"; import { getSimpleGraph } from "@local/hash-backend-utils/simplified-graph"; import { queryEntitySubgraph } from "@local/hash-graph-sdk/entity"; import type { AiActionStepOutput } from "@local/hash-isomorphic-utils/flows/action-definitions"; @@ -401,7 +405,7 @@ export const answerQuestionAction: AiFlowActionActivity< > = async ({ inputs }) => { const { context, - entities: inputEntities, + entities: entitiesInput, question, } = getSimplifiedAiFlowActionInputs({ inputs, @@ -410,6 +414,15 @@ export const answerQuestionAction: AiFlowActionActivity< const { userAuthentication } = await getFlowContext(); + // Resolve the stored ref to get the array of PersistedEntitiesMetadata + const inputEntities = entitiesInput + ? 
await resolvePayloadValue( + getStorageProvider(), + "PersistedEntitiesMetadata", + entitiesInput, + ) + : undefined; + const entities = inputEntities ? await mapActionInputEntitiesToEntities({ actorId: userAuthentication.actorId, diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/infer-entities-from-content-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/infer-entities-from-content-action.ts index 8ad7d49c4b3..fd06288f415 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/infer-entities-from-content-action.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/infer-entities-from-content-action.ts @@ -12,6 +12,10 @@ import { } from "@blockprotocol/type-system"; import { typedKeys } from "@local/advanced-types/typed-entries"; import type { AiFlowActionActivity } from "@local/hash-backend-utils/flows"; +import { + getStorageProvider, + storePayload, +} from "@local/hash-backend-utils/flows/payload-storage"; import { isInferenceModelName } from "@local/hash-isomorphic-utils/ai-inference-types"; import { getSimplifiedAiFlowActionInputs } from "@local/hash-isomorphic-utils/flows/action-definitions"; import type { ProposedEntity } from "@local/hash-isomorphic-utils/flows/types"; @@ -43,7 +47,7 @@ export const inferEntitiesFromContentAction: AiFlowActionActivity< actionType: "inferEntitiesFromContent", }); - const { flowEntityId, userAuthentication, stepId, webId } = + const { flowEntityId, userAuthentication, stepId, webId, workflowId, runId } = await getFlowContext(); const aiAssistantAccountId = await getAiAssistantAccountIdActivity({ @@ -208,6 +212,17 @@ export const inferEntitiesFromContentAction: AiFlowActionActivity< }), ); + // Store the proposed entities in S3 to avoid passing large payloads through Temporal + const storedRef = await storePayload({ + storageProvider: getStorageProvider(), + workflowId, + runId, + stepId, + outputName: "proposedEntities", + kind: "ProposedEntity", + value: 
proposedEntities, + }); + return { code: StatusCode.Ok, contents: [ @@ -217,7 +232,7 @@ export const inferEntitiesFromContentAction: AiFlowActionActivity< outputName: "proposedEntities", payload: { kind: "ProposedEntity", - value: proposedEntities, + value: storedRef, }, }, ], diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/infer-metadata-from-document-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/infer-metadata-from-document-action.ts index 5e179a262cc..511480e7f9c 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/infer-metadata-from-document-action.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/infer-metadata-from-document-action.ts @@ -7,6 +7,10 @@ import type { } from "@blockprotocol/type-system"; import { extractEntityUuidFromEntityId } from "@blockprotocol/type-system"; import type { AiFlowActionActivity } from "@local/hash-backend-utils/flows"; +import { + getStorageProvider, + storePayload, +} from "@local/hash-backend-utils/flows/payload-storage"; import type { HashEntity } from "@local/hash-graph-sdk/entity"; import { getSimplifiedAiFlowActionInputs } from "@local/hash-isomorphic-utils/flows/action-definitions"; import type { PersistedEntityMetadata } from "@local/hash-isomorphic-utils/flows/types"; @@ -40,9 +44,11 @@ export const inferMetadataFromDocumentAction: AiFlowActionActivity< > = async ({ inputs }) => { const { flowEntityId, + runId, stepId, userAuthentication: { actorId: userActorId }, webId, + workflowId, } = await getFlowContext(); const { documentEntityId } = getSimplifiedAiFlowActionInputs({ @@ -234,6 +240,17 @@ export const inferMetadataFromDocumentAction: AiFlowActionActivity< propertyProvenance, }); + // Store the proposed entities in S3 to avoid passing large payloads through Temporal + const storedRef = await storePayload({ + storageProvider: getStorageProvider(), + workflowId, + runId, + stepId, + outputName: "proposedEntities", + kind: "ProposedEntity", + value: 
proposedEntities, + }); + return { code: StatusCode.Ok, contents: [ @@ -243,7 +260,7 @@ export const inferMetadataFromDocumentAction: AiFlowActionActivity< outputName: "proposedEntities", payload: { kind: "ProposedEntity", - value: proposedEntities, + value: storedRef, }, }, { diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entities-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entities-action.ts index 95e55713baf..2db60d0a22d 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entities-action.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entities-action.ts @@ -1,5 +1,10 @@ import type { EntityId } from "@blockprotocol/type-system"; import type { AiFlowActionActivity } from "@local/hash-backend-utils/flows"; +import { + getStorageProvider, + resolvePayloadValue, + storePayload, +} from "@local/hash-backend-utils/flows/payload-storage"; import { flattenPropertyMetadata } from "@local/hash-graph-sdk/entity"; import { getSimplifiedAiFlowActionInputs } from "@local/hash-isomorphic-utils/flows/action-definitions"; import type { @@ -8,19 +13,27 @@ import type { ProposedEntityWithResolvedLinks, } from "@local/hash-isomorphic-utils/flows/types"; import { StatusCode } from "@local/status"; +import { Context } from "@temporalio/activity"; -import { - fileEntityTypeIds, - persistEntityAction, -} from "./persist-entity-action.js"; +import { getFlowContext } from "../shared/get-flow-context.js"; +import { fileEntityTypeIds, persistEntity } from "./persist-entity-action.js"; export const persistEntitiesAction: AiFlowActionActivity< "persistEntities" > = async ({ inputs }) => { - const { draft, proposedEntities } = getSimplifiedAiFlowActionInputs({ - inputs, - actionType: "persistEntities", - }); + const { runId, stepId, workflowId } = await getFlowContext(); + + const { draft, proposedEntities: proposedEntitiesInput } = + getSimplifiedAiFlowActionInputs({ + inputs, + 
actionType: "persistEntities", + }); + + const proposedEntities = await resolvePayloadValue( + getStorageProvider(), + "ProposedEntity", + proposedEntitiesInput, + ); /** * Sort the entities to persist in dependency order: @@ -78,6 +91,9 @@ export const persistEntitiesAction: AiFlowActionActivity< * if an existing entity is found to update rather than a new one with the localId being created. */ for (const unresolvedEntity of entitiesWithDependenciesSortedLast) { + // Heartbeat to indicate the activity is still running + Context.current().heartbeat(); + const { claims, entityTypeIds, @@ -158,20 +174,9 @@ export const persistEntitiesAction: AiFlowActionActivity< } } - const persistedEntityOutputs = await persistEntityAction({ - inputs: [ - { - inputName: "draft", - payload: { kind: "Boolean", value: draft ?? false }, - }, - { - inputName: "proposedEntityWithResolvedLinks", - payload: { - kind: "ProposedEntityWithResolvedLinks", - value: entityWithResolvedLinks, - }, - }, - ], + const persistedEntityOutputs = await persistEntity({ + proposedEntityWithResolvedLinks: entityWithResolvedLinks, + draft: draft ?? 
false, }); const output = persistedEntityOutputs.contents[0]?.outputs[0]?.payload; @@ -210,6 +215,20 @@ export const persistEntitiesAction: AiFlowActionActivity< const persistedEntities = Object.values(persistedEntitiesByLocalId); + // Store the output in S3 to avoid passing large payloads through Temporal + const storedRef = await storePayload({ + storageProvider: getStorageProvider(), + workflowId, + runId, + stepId, + outputName: "persistedEntities", + kind: "PersistedEntitiesMetadata", + value: { + persistedEntities, + failedEntityProposals: Object.values(failedEntitiesByLocalId), + }, + }); + return { /** @todo H-2604 have some kind of 'partially completed' status when reworking flow return codes */ code: @@ -231,10 +250,7 @@ export const persistEntitiesAction: AiFlowActionActivity< outputName: "persistedEntities", payload: { kind: "PersistedEntitiesMetadata", - value: { - persistedEntities, - failedEntityProposals: Object.values(failedEntitiesByLocalId), - }, + value: storedRef, }, }, ], diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entity-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entity-action.ts index 24efa817ffd..e77ad737921 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entity-action.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/persist-entity-action.ts @@ -1,6 +1,10 @@ import type { EntityId, VersionedUrl } from "@blockprotocol/type-system"; import { extractEntityUuidFromEntityId } from "@blockprotocol/type-system"; import type { AiFlowActionActivity } from "@local/hash-backend-utils/flows"; +import { + getStorageProvider, + resolvePayloadValue, +} from "@local/hash-backend-utils/flows/payload-storage"; import { getWebMachineId } from "@local/hash-backend-utils/machine-actors"; import type { CreateEntityParameters } from "@local/hash-graph-sdk/entity"; import { @@ -9,7 +13,10 @@ import { mergePropertyObjectAndMetadata, } from 
"@local/hash-graph-sdk/entity"; import { getSimplifiedAiFlowActionInputs } from "@local/hash-isomorphic-utils/flows/action-definitions"; -import type { PersistedEntityMetadata } from "@local/hash-isomorphic-utils/flows/types"; +import type { + PersistedEntityMetadata, + ProposedEntityWithResolvedLinks, +} from "@local/hash-isomorphic-utils/flows/types"; import { systemEntityTypes } from "@local/hash-isomorphic-utils/ontology-type-ids"; import type { HasObject, @@ -47,9 +54,18 @@ export const fileEntityTypeIds: VersionedUrl[] = [ systemEntityTypes.pptxPresentation.entityTypeId, ]; -export const persistEntityAction: AiFlowActionActivity< - "persistEntity" -> = async ({ inputs }) => { +/** + * Inner function that handles the actual entity persistence logic. + * This is called by both persistEntityAction (which resolves the payload ref first) + * and persistEntitiesAction (which passes the resolved value directly). + */ +export const persistEntity = async ({ + proposedEntityWithResolvedLinks, + draft, +}: { + proposedEntityWithResolvedLinks: ProposedEntityWithResolvedLinks; + draft: boolean; +}): Promise>> => { const { flowEntityId, stepId, @@ -57,13 +73,7 @@ export const persistEntityAction: AiFlowActionActivity< webId, } = await getFlowContext(); - const { draft, proposedEntityWithResolvedLinks } = - getSimplifiedAiFlowActionInputs({ - inputs, - actionType: "persistEntity", - }); - - const createEditionAsDraft = draft ?? false; + const createEditionAsDraft = draft; const { entityTypeIds, @@ -336,3 +346,27 @@ export const persistEntityAction: AiFlowActionActivity< ], }; }; + +/** + * Flow action activity that persists a single entity. 
+ */ +export const persistEntityAction: AiFlowActionActivity< + "persistEntity" +> = async ({ inputs }) => { + const { draft, proposedEntityWithResolvedLinks: proposedEntityInput } = + getSimplifiedAiFlowActionInputs({ + inputs, + actionType: "persistEntity", + }); + + const proposedEntityWithResolvedLinks = await resolvePayloadValue( + getStorageProvider(), + "ProposedEntityWithResolvedLinks", + proposedEntityInput, + ); + + return persistEntity({ + proposedEntityWithResolvedLinks, + draft: draft ?? false, + }); +}; diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/coordinating-agent.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/coordinating-agent.ts index c6a05dad5f1..aa2f030199d 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/coordinating-agent.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/coordinating-agent.ts @@ -5,6 +5,11 @@ import type { } from "@blockprotocol/type-system"; import { entityIdFromComponents } from "@blockprotocol/type-system"; import type { AiFlowActionActivity } from "@local/hash-backend-utils/flows"; +import { + getStorageProvider, + resolvePayloadValue, + storePayload, +} from "@local/hash-backend-utils/flows/payload-storage"; import { flattenPropertyMetadata } from "@local/hash-graph-sdk/entity"; import { getSimplifiedAiFlowActionInputs } from "@local/hash-isomorphic-utils/flows/action-definitions"; import type { @@ -68,7 +73,7 @@ const parseAndResolveCoordinatorInputs = async (params: { const { prompt, entityTypeIds, - existingEntities: inputExistingEntities, + existingEntities: existingEntitiesInput, reportSpecification, } = getSimplifiedAiFlowActionInputs({ inputs: stepInputs, @@ -77,6 +82,15 @@ const parseAndResolveCoordinatorInputs = async (params: { const { userAuthentication } = await getFlowContext(); + // Resolve the stored ref to get the array of 
PersistedEntitiesMetadata + const inputExistingEntities = existingEntitiesInput + ? await resolvePayloadValue( + getStorageProvider(), + "PersistedEntitiesMetadata", + existingEntitiesInput, + ) + : undefined; + /** * @todo: simplify the properties in the existing entities */ @@ -160,7 +174,8 @@ export const runCoordinatingAgent: AiFlowActionActivity< testingParams, }); - const { flowEntityId, stepId, webId } = await getFlowContext(); + const { flowEntityId, runId, stepId, webId, workflowId } = + await getFlowContext(); const providedFileEntities = await getProvidedFiles(); @@ -583,6 +598,21 @@ export const runCoordinatingAgent: AiFlowActionActivity< }, ]); + // Store the proposed entities in S3 to avoid passing large payloads through Temporal + const allProposedEntitiesForOutput = [ + ...allProposedEntities, + ...fileEntityProposals, + ]; + const storedRef = await storePayload({ + storageProvider: getStorageProvider(), + workflowId, + runId, + stepId, + outputName: "proposedEntities", + kind: "ProposedEntity", + value: allProposedEntitiesForOutput, + }); + return { code: StatusCode.Ok, contents: [ @@ -592,7 +622,7 @@ export const runCoordinatingAgent: AiFlowActionActivity< outputName: "proposedEntities", payload: { kind: "ProposedEntity", - value: [...allProposedEntities, ...fileEntityProposals], + value: storedRef, }, }, { diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/link-follower-agent.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/link-follower-agent.ts index 797c17834bc..83efd97f8c7 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/link-follower-agent.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/research-entities-action/link-follower-agent.ts @@ -1,7 +1,6 @@ import type { SourceProvenance, Url } from "@blockprotocol/type-system"; import { currentTimestamp } from "@blockprotocol/type-system"; -import { getAwsS3Config } 
from "@local/hash-backend-utils/aws-config"; -import { AwsS3StorageProvider } from "@local/hash-backend-utils/file-storage/aws-s3-storage-provider"; +import { getStorageProvider } from "@local/hash-backend-utils/flows/payload-storage"; import type { WorkerIdentifiers } from "@local/hash-isomorphic-utils/flows/types"; import { Context } from "@temporalio/activity"; import dedent from "dedent"; @@ -145,11 +144,7 @@ const exploreResource = async (params: { ]; if (storageKey) { - const s3Config = getAwsS3Config(); - - const downloadProvider = new AwsS3StorageProvider(s3Config); - - urlForDownload = await downloadProvider.presignDownload({ + urlForDownload = await getStorageProvider().presignDownload({ entity: hashEntityForFile, expiresInSeconds: 60 * 60, key: storageKey, diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/shared/create-file-entity-from-url.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/shared/create-file-entity-from-url.ts index 3bd09d9e1c7..8736b1e0bcb 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/shared/create-file-entity-from-url.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/shared/create-file-entity-from-url.ts @@ -19,12 +19,11 @@ import type { ProvidedEntityEditionProvenance, VersionedUrl, } from "@blockprotocol/type-system"; -import { getAwsS3Config } from "@local/hash-backend-utils/aws-config"; import { formatFileUrl, getEntityTypeIdForMimeType, } from "@local/hash-backend-utils/file-storage"; -import { AwsS3StorageProvider } from "@local/hash-backend-utils/file-storage/aws-s3-storage-provider"; +import { getStorageProvider } from "@local/hash-backend-utils/flows/payload-storage"; import { getWebMachineId } from "@local/hash-backend-utils/machine-actors"; import { HashEntity, @@ -264,20 +263,18 @@ export const createFileEntityFromUrl = async (params: { }, ); - const s3Config = getAwsS3Config(); - - const uploadProvider = new AwsS3StorageProvider(s3Config); + const storageProvider = 
getStorageProvider(); const editionIdentifier = generateUuid(); - const key = uploadProvider.getFileEntityStorageKey({ + const key = storageProvider.getFileEntityStorageKey({ entityId: incompleteFileEntity.metadata.recordId.entityId, editionIdentifier, filename, }); const { fileStorageProperties, presignedPut } = - await uploadProvider.presignUpload({ + await storageProvider.presignUpload({ expiresInSeconds: 60 * 60 * 24, // 24 hours headers: { "content-length": fileSizeInBytes, diff --git a/apps/hash-ai-worker-ts/src/activities/flow-activities/write-google-sheet-action.ts b/apps/hash-ai-worker-ts/src/activities/flow-activities/write-google-sheet-action.ts index a3fbdaafca9..b2d4bfab37a 100644 --- a/apps/hash-ai-worker-ts/src/activities/flow-activities/write-google-sheet-action.ts +++ b/apps/hash-ai-worker-ts/src/activities/flow-activities/write-google-sheet-action.ts @@ -3,6 +3,10 @@ import type { ProvidedEntityEditionProvenance, } from "@blockprotocol/type-system"; import type { AiFlowActionActivity } from "@local/hash-backend-utils/flows"; +import { + getStorageProvider, + resolvePayloadValue, +} from "@local/hash-backend-utils/flows/payload-storage"; import { createGoogleOAuth2Client, getGoogleAccountById, @@ -12,6 +16,11 @@ import { getWebMachineId } from "@local/hash-backend-utils/machine-actors"; import type { VaultClient } from "@local/hash-backend-utils/vault"; import { HashEntity } from "@local/hash-graph-sdk/entity"; import { getSimplifiedAiFlowActionInputs } from "@local/hash-isomorphic-utils/flows/action-definitions"; +import type { + PersistedEntitiesMetadata, + StoredPayloadRef, +} from "@local/hash-isomorphic-utils/flows/types"; +import { isStoredPayloadRef } from "@local/hash-isomorphic-utils/flows/types"; import { generateEntityIdFilter } from "@local/hash-isomorphic-utils/graph-queries"; import { googleEntityTypes, @@ -133,18 +142,33 @@ export const writeGoogleSheetAction: AiFlowActionActivity< */ let sheetRequests: sheets_v4.Schema$Request[] | 
undefined; - if ("format" in dataToWrite) { - if (dataToWrite.format !== "CSV") { + // Resolve stored ref if dataToWrite is a StoredPayloadRef (for PersistedEntitiesMetadata) + let resolvedDataToWrite: + | Exclude> + | PersistedEntitiesMetadata; + + if (isStoredPayloadRef(dataToWrite)) { + resolvedDataToWrite = await resolvePayloadValue( + getStorageProvider(), + "PersistedEntitiesMetadata", + dataToWrite, + ); + } else { + resolvedDataToWrite = dataToWrite; + } + + if ("format" in resolvedDataToWrite) { + if (resolvedDataToWrite.format !== "CSV") { return { code: StatusCode.InvalidArgument, - message: `Invalid text format '${dataToWrite.format}' provided, must be 'CSV'.`, + message: `Invalid text format '${resolvedDataToWrite.format}' provided, must be 'CSV'.`, contents: [], }; } try { sheetRequests = convertCsvToSheetRequests({ - csvString: dataToWrite.content, + csvString: resolvedDataToWrite.content, format: { audience }, }); } catch { @@ -155,21 +179,22 @@ export const writeGoogleSheetAction: AiFlowActionActivity< }; } } else { - const isPersistedEntities = "persistedEntities" in dataToWrite; - const queryFilter = isPersistedEntities - ? { - any: dataToWrite.persistedEntities.map((persistedEntityMetadata) => - generateEntityIdFilter({ - entityId: persistedEntityMetadata.entityId, - includeArchived: false, - }), - ), - } - : await getFilterFromBlockProtocolQueryEntity({ - authentication: { actorId: userAccountId }, - graphApiClient, - queryEntityId: dataToWrite, - }); + const queryFilter = + "persistedEntities" in resolvedDataToWrite + ? 
{ + any: resolvedDataToWrite.persistedEntities.map( + (persistedEntityMetadata) => + generateEntityIdFilter({ + entityId: persistedEntityMetadata.entityId, + includeArchived: false, + }), + ), + } + : await getFilterFromBlockProtocolQueryEntity({ + authentication: { actorId: userAccountId }, + graphApiClient, + queryEntityId: resolvedDataToWrite, + }); const subgraph = await getSubgraphFromFilter({ authentication: { actorId: userAccountId }, @@ -181,34 +206,35 @@ export const writeGoogleSheetAction: AiFlowActionActivity< * * @todo once we start using a Structural Query instead, it can specify the traversal depth itself (1 becomes variable) */ - traversalPaths: isPersistedEntities - ? [] - : [ - { - edges: [ - { - kind: "has-left-entity", - direction: "incoming", - }, - { - kind: "has-right-entity", - direction: "outgoing", - }, - ], - }, - { - edges: [ - { - kind: "has-right-entity", - direction: "incoming", - }, - { - kind: "has-left-entity", - direction: "outgoing", - }, - ], - }, - ], + traversalPaths: + "persistedEntities" in resolvedDataToWrite + ? 
[] + : [ + { + edges: [ + { + kind: "has-left-entity", + direction: "incoming", + }, + { + kind: "has-right-entity", + direction: "outgoing", + }, + ], + }, + { + edges: [ + { + kind: "has-right-entity", + direction: "incoming", + }, + { + kind: "has-left-entity", + direction: "outgoing", + }, + ], + }, + ], }); sheetRequests = convertSubgraphToSheetRequests({ diff --git a/apps/hash-ai-worker-ts/src/activities/infer-entities/shared/extract-validation-failure-details.ts b/apps/hash-ai-worker-ts/src/activities/infer-entities/shared/extract-validation-failure-details.ts index 070979ad730..81e3e055cfe 100644 --- a/apps/hash-ai-worker-ts/src/activities/infer-entities/shared/extract-validation-failure-details.ts +++ b/apps/hash-ai-worker-ts/src/activities/infer-entities/shared/extract-validation-failure-details.ts @@ -1,10 +1,5 @@ import { stringifyError } from "@local/hash-isomorphic-utils/stringify-error"; -import { stringify } from "../../shared/stringify.js"; - -const generateErrorMessage = (err: unknown) => - err instanceof Error ? 
err.message : stringify(err); - const isTrueObject = (obj: unknown): obj is object => obj !== null && typeof obj === "object"; @@ -73,5 +68,5 @@ export const extractErrorMessage = (err: unknown) => { // eslint-disable-next-line no-console console.error(`Unexpected error message structure: ${stringifyError(err)}`); } - return generateErrorMessage(err); + return stringifyError(err); }; diff --git a/apps/hash-ai-worker-ts/src/activities/shared/get-flow-context.ts b/apps/hash-ai-worker-ts/src/activities/shared/get-flow-context.ts index 8c88093be4d..91e7ce4709d 100644 --- a/apps/hash-ai-worker-ts/src/activities/shared/get-flow-context.ts +++ b/apps/hash-ai-worker-ts/src/activities/shared/get-flow-context.ts @@ -1,14 +1,12 @@ -import type { - EntityId, - EntityUuid, - UserId, - WebId, -} from "@blockprotocol/type-system"; +import type { EntityId, UserId, WebId } from "@blockprotocol/type-system"; import { - entityIdFromComponents, extractEntityUuidFromEntityId, extractWebIdFromEntityId, } from "@blockprotocol/type-system"; +import { + getFlowContextCache, + getFlowEntityInfo, +} from "@local/hash-backend-utils/flows/get-flow-context"; import { createTemporalClient } from "@local/hash-backend-utils/temporal"; import { parseHistoryItemPayload } from "@local/hash-backend-utils/temporal/parse-history-item-payload"; import { type HashEntity, queryEntities } from "@local/hash-graph-sdk/entity"; @@ -21,28 +19,19 @@ import { normalizeWhitespace } from "@local/hash-isomorphic-utils/normalize"; import type { File } from "@local/hash-isomorphic-utils/system-types/shared"; import { Context } from "@temporalio/activity"; import type { Client as TemporalClient } from "@temporalio/client"; -import type { MemoryCache } from "cache-manager"; -import { caching } from "cache-manager"; import { graphApiClient } from "./graph-api-client.js"; let _temporalClient: TemporalClient | undefined; -let _runFlowWorkflowParamsCache: MemoryCache | undefined; - -type PartialRunFlowWorkflowParams = Pick< 
- RunAiFlowWorkflowParams, - "dataSources" | "webId" | "userAuthentication" -> & { createEntitiesAsDraft: boolean }; - -const getCache = async () => { - _runFlowWorkflowParamsCache = - _runFlowWorkflowParamsCache ?? - (await caching("memory", { - max: 100, // 100 items - ttl: 10 * 60 * 1000, // 10 minutes - })); - return _runFlowWorkflowParamsCache; +/** + * AI-specific workflow params that extend the base params with draft and data source info. + */ +type AiWorkflowParams = { + createEntitiesAsDraft: boolean; + dataSources: FlowDataSources; + userAuthentication: { actorId: UserId }; + webId: WebId; }; export const getTemporalClient = async () => { @@ -50,20 +39,21 @@ export const getTemporalClient = async () => { return _temporalClient; }; -const getPartialRunFlowWorkflowParams = async (params: { +/** + * Get AI-specific workflow params from Temporal workflow history. + * Extends the base workflow params with createEntitiesAsDraft and dataSources. + */ +const getAiWorkflowParams = async (params: { workflowId: string; -}): Promise => { +}): Promise => { const { workflowId } = params; - const runFlowWorkflowParamsCache = await getCache(); + const cache = await getFlowContextCache(); + const cacheKey = `aiWorkflowParams-${workflowId}`; - const cachedPartialRunFlowWorkflowParams = - await runFlowWorkflowParamsCache.get( - workflowId, - ); - - if (cachedPartialRunFlowWorkflowParams) { - return cachedPartialRunFlowWorkflowParams; + const cachedParams = await cache.get(cacheKey); + if (cachedParams) { + return cachedParams; } const temporalClient = await getTemporalClient(); @@ -113,32 +103,26 @@ const getPartialRunFlowWorkflowParams = async (params: { draftTriggerInputNames.includes(output.outputName as "draft"), )?.payload.value; - /** - * Avoid caching the entire `RunFlowWorkflowParams` object to reduce memory usage - * of the cache. 
- */ - const partialRunFlowWorkflowParams: PartialRunFlowWorkflowParams = { + const aiParams: AiWorkflowParams = { createEntitiesAsDraft, dataSources: runFlowWorkflowParams.dataSources, userAuthentication: runFlowWorkflowParams.userAuthentication, webId: runFlowWorkflowParams.webId, }; - await runFlowWorkflowParamsCache.set( - workflowId, - partialRunFlowWorkflowParams, - ); - - return partialRunFlowWorkflowParams; + await cache.set(cacheKey, aiParams); + return aiParams; }; type FlowContext = { createEntitiesAsDraft: boolean; dataSources: FlowDataSources; flowEntityId: EntityId; + runId: string; stepId: string; userAuthentication: { actorId: UserId }; webId: WebId; + workflowId: string; }; /** @@ -151,18 +135,21 @@ type FlowContext = { export const getFlowContext = async (): Promise => { const activityContext = Context.current(); - const { workflowId } = activityContext.info.workflowExecution; + const { workflowId, runId } = activityContext.info.workflowExecution; const { createEntitiesAsDraft, dataSources, userAuthentication, webId } = - await getPartialRunFlowWorkflowParams({ + await getAiWorkflowParams({ workflowId, }); - const flowEntityId = entityIdFromComponents( - webId, - // Assumes the flow entity UUID is the same as the workflow ID - workflowId as EntityUuid, - ); + // Query for the flow entity by workflowId (stored as a property on the entity) + // This is necessary because the entity UUID may not match the workflow ID + // Uses shared utility with retry logic for race condition handling + const { flowEntityId } = await getFlowEntityInfo({ + workflowId, + userAuthentication, + graphApiClient, + }); const { activityId: stepId } = Context.current().info; @@ -171,8 +158,10 @@ export const getFlowContext = async (): Promise => { dataSources, userAuthentication, flowEntityId, + runId, webId, stepId, + workflowId, }; }; @@ -188,7 +177,7 @@ export const getProvidedFiles = async (): Promise[]> => { } const filesCacheKey = `files-${flowEntityId}`; - const cache 
= await getCache(); + const cache = await getFlowContextCache(); const cachedFiles = await cache.get[]>(filesCacheKey); diff --git a/apps/hash-ai-worker-ts/src/activities/shared/map-action-input-entities-to-entities.ts b/apps/hash-ai-worker-ts/src/activities/shared/map-action-input-entities-to-entities.ts index ca22e2148fc..5ba67f23936 100644 --- a/apps/hash-ai-worker-ts/src/activities/shared/map-action-input-entities-to-entities.ts +++ b/apps/hash-ai-worker-ts/src/activities/shared/map-action-input-entities-to-entities.ts @@ -12,24 +12,24 @@ import { currentTimeInstantTemporalAxes } from "@local/hash-isomorphic-utils/gra export const mapActionInputEntitiesToEntities = async (params: { actorId: ActorEntityUuid; graphApiClient: GraphApi; - inputEntities: ( - | SerializedEntity - | PersistedEntityMetadata - | PersistedEntitiesMetadata - )[]; + inputEntities: + | SerializedEntity[] + | PersistedEntityMetadata[] + | PersistedEntitiesMetadata; }): Promise => { const { actorId, graphApiClient, inputEntities } = params; const entityIdsToFetch: EntityId[] = []; const directEntities: HashEntity[] = []; - for (const inputEntity of inputEntities) { + const inputEntitiesArray = + "persistedEntities" in inputEntities + ? 
inputEntities.persistedEntities + : inputEntities; + + for (const inputEntity of inputEntitiesArray) { if ("operation" in inputEntity) { entityIdsToFetch.push(inputEntity.entityId); - } else if ("persistedEntities" in inputEntity) { - for (const persistedEntity of inputEntity.persistedEntities) { - entityIdsToFetch.push(persistedEntity.entityId); - } } else { // SerializedEntity - convert directly directEntities.push(new HashEntity(inputEntity)); diff --git a/apps/hash-ai-worker-ts/src/activities/shared/use-file-system-file-from-url.ts b/apps/hash-ai-worker-ts/src/activities/shared/use-file-system-file-from-url.ts index ed823b36bfa..43a5f314291 100644 --- a/apps/hash-ai-worker-ts/src/activities/shared/use-file-system-file-from-url.ts +++ b/apps/hash-ai-worker-ts/src/activities/shared/use-file-system-file-from-url.ts @@ -6,8 +6,7 @@ import { finished } from "node:stream/promises"; import type { ReadableStream } from "node:stream/web"; import { fileURLToPath } from "node:url"; -import { getAwsS3Config } from "@local/hash-backend-utils/aws-config"; -import { AwsS3StorageProvider } from "@local/hash-backend-utils/file-storage/aws-s3-storage-provider"; +import { getStorageProvider } from "@local/hash-backend-utils/flows/payload-storage"; import type { HashEntity } from "@local/hash-graph-sdk/entity"; import { generateUuid } from "@local/hash-isomorphic-utils/generate-uuid"; import type { File } from "@local/hash-isomorphic-utils/system-types/shared"; @@ -65,11 +64,7 @@ export const useFileSystemPathFromEntity = async ( const filePath = `${baseFilePath}/${generateUuid()}.pdf`; - const s3Config = getAwsS3Config(); - - const downloadProvider = new AwsS3StorageProvider(s3Config); - - const urlForDownload = await downloadProvider.presignDownload({ + const urlForDownload = await getStorageProvider().presignDownload({ entity: fileEntity, expiresInSeconds: 60 * 60, key: storageKey, diff --git a/apps/hash-ai-worker-ts/src/workflows/run-flow-workflow.ts 
b/apps/hash-ai-worker-ts/src/workflows/run-flow-workflow.ts index ae6b91cdfe7..d77e3fe316d 100644 --- a/apps/hash-ai-worker-ts/src/workflows/run-flow-workflow.ts +++ b/apps/hash-ai-worker-ts/src/workflows/run-flow-workflow.ts @@ -31,6 +31,11 @@ const activitiesHandlingCancellation: FlowActivityId[] = [ "researchEntitiesAction", ]; +const activitiesHeartbeating: FlowActivityId[] = [ + ...activitiesHandlingCancellation, + "persistEntitiesAction", +]; + const proxyFlowActivity: ProxyFlowActivity< AiFlowActionDefinitionId, typeof createFlowActivities @@ -44,7 +49,7 @@ const proxyFlowActivity: ProxyFlowActivity< ? ActivityCancellationType.WAIT_CANCELLATION_COMPLETED : ActivityCancellationType.ABANDON, - startToCloseTimeout: activitiesHandlingCancellation.includes(actionName) + startToCloseTimeout: activitiesHeartbeating.includes(actionName) ? /** * @todo H-3129 – research tasks can take a long time, and waiting for user input takes an indefinite amount of time. * - we need to be able to sleep at the workflow level and have activities that take a bounded, shorter amount of time. @@ -64,7 +69,7 @@ const proxyFlowActivity: ProxyFlowActivity< * - heartbeats are throttled by default to 80% of the heartbeatTimeout, so sending a heartbeat does not mean it will be processed then * - maxHeartbeatThrottleInterval can be set in WorkerOptions, and otherwise defaults to 60s */ - heartbeatTimeout: activitiesHandlingCancellation.includes(actionName) + heartbeatTimeout: activitiesHeartbeating.includes(actionName) ? 
`${heartbeatTimeoutSeconds} second` : undefined, retry: { maximumAttempts }, diff --git a/apps/hash-api/src/ai/infer-entities-websocket.ts b/apps/hash-api/src/ai/infer-entities-websocket.ts index 85c4bf50fb8..1dbbc6417ab 100644 --- a/apps/hash-api/src/ai/infer-entities-websocket.ts +++ b/apps/hash-api/src/ai/infer-entities-websocket.ts @@ -2,6 +2,7 @@ import type http from "node:http"; import type { EntityUuid } from "@blockprotocol/type-system"; import type { DistributiveOmit } from "@local/advanced-types/distribute"; +import type { FileStorageProvider } from "@local/hash-backend-utils/file-storage"; import { getFlowRunEntityById, getFlowRuns, @@ -29,10 +30,12 @@ const inferEntitiesMessageHandler = async ({ graphApiClient, temporalClient, message, + storageProvider, user, }: { graphApiClient: GraphApi; socket: WebSocket; + storageProvider: FileStorageProvider; temporalClient: Client; message: DistributiveOmit; user: User; @@ -49,6 +52,7 @@ const inferEntitiesMessageHandler = async ({ graphApiClient, temporalClient, message, + storageProvider, user, }); return; @@ -67,6 +71,7 @@ const inferEntitiesMessageHandler = async ({ filters: { executionStatus: FlowRunStatus.Running }, graphApiClient, includeDetails: true, + storageProvider, temporalClient, }); @@ -112,11 +117,13 @@ export const openInferEntitiesWebSocket = ({ context, httpServer, logger, + storageProvider, temporalClient, }: { context: ImpureGraphContext; httpServer: http.Server; logger: Logger; + storageProvider: FileStorageProvider; temporalClient: Client; }) => { const wss = new WebSocketServer({ @@ -157,6 +164,7 @@ export const openInferEntitiesWebSocket = ({ void inferEntitiesMessageHandler({ graphApiClient: context.graphApi, socket, + storageProvider, temporalClient, message, user, diff --git a/apps/hash-api/src/ai/infer-entities-websocket/handle-infer-entities-request.ts b/apps/hash-api/src/ai/infer-entities-websocket/handle-infer-entities-request.ts index 0c563054284..16c7a5ab846 100644 --- 
a/apps/hash-api/src/ai/infer-entities-websocket/handle-infer-entities-request.ts +++ b/apps/hash-api/src/ai/infer-entities-websocket/handle-infer-entities-request.ts @@ -1,5 +1,6 @@ import type { DistributiveOmit } from "@local/advanced-types/distribute"; import { typedEntries } from "@local/advanced-types/typed-entries"; +import type { FileStorageProvider } from "@local/hash-backend-utils/file-storage"; import { getFlowRuns } from "@local/hash-backend-utils/flows"; import type { GraphApi } from "@local/hash-graph-client"; import type { @@ -29,11 +30,13 @@ import { FlowRunStatus } from "../../graphql/api-types.gen"; export const handleInferEntitiesRequest = async ({ graphApiClient, + storageProvider, temporalClient, message, user, }: { graphApiClient: GraphApi; + storageProvider: FileStorageProvider; temporalClient: Client; message: DistributiveOmit< | ManualInferenceWebsocketRequestMessage @@ -72,6 +75,7 @@ export const handleInferEntitiesRequest = async ({ }, graphApiClient, includeDetails: true, + storageProvider, temporalClient, }); diff --git a/apps/hash-api/src/graph/context-types.ts b/apps/hash-api/src/graph/context-types.ts index 4a6a4ce6866..3aa96e66a86 100644 --- a/apps/hash-api/src/graph/context-types.ts +++ b/apps/hash-api/src/graph/context-types.ts @@ -1,5 +1,5 @@ import type { ActorType, OriginProvenance } from "@blockprotocol/type-system"; -import type { UploadableStorageProvider } from "@local/hash-backend-utils/file-storage"; +import type { FileStorageProvider } from "@local/hash-backend-utils/file-storage"; import type { TemporalClient } from "@local/hash-backend-utils/temporal"; import type { GraphApi as GraphApiClient } from "@local/hash-graph-client"; import type { AuthenticationContext } from "@local/hash-graph-sdk/authentication-context"; @@ -16,8 +16,8 @@ export type ImpureGraphContext< origin: OriginProvenance; }; } & (RequiresUpload extends true - ? 
{ uploadProvider: UploadableStorageProvider } - : { uploadProvider?: UploadableStorageProvider }) & + ? { uploadProvider: FileStorageProvider } + : { uploadProvider?: FileStorageProvider }) & (RequiresTemporal extends true ? { temporalClient: TemporalClient } : { temporalClient?: TemporalClient }); diff --git a/apps/hash-api/src/graphql/context.ts b/apps/hash-api/src/graphql/context.ts index e2e9839b859..253f450fca6 100644 --- a/apps/hash-api/src/graphql/context.ts +++ b/apps/hash-api/src/graphql/context.ts @@ -1,5 +1,5 @@ import type { ProvidedEntityEditionProvenance } from "@blockprotocol/type-system"; -import type { UploadableStorageProvider } from "@local/hash-backend-utils/file-storage"; +import type { FileStorageProvider } from "@local/hash-backend-utils/file-storage"; import type { Logger } from "@local/hash-backend-utils/logger"; import type { TemporalClient } from "@local/hash-backend-utils/temporal"; import type { VaultClient } from "@local/hash-backend-utils/vault"; @@ -16,7 +16,7 @@ import type { User } from "../graph/knowledge/system-types/user"; export interface GraphQLContext { dataSources: { graphApi: GraphApi; - uploadProvider: UploadableStorageProvider; + uploadProvider: FileStorageProvider; }; emailTransporter: EmailTransporter; logger: Logger; diff --git a/apps/hash-api/src/graphql/create-apollo-server.ts b/apps/hash-api/src/graphql/create-apollo-server.ts index 7586aaae6ff..719677c42d0 100644 --- a/apps/hash-api/src/graphql/create-apollo-server.ts +++ b/apps/hash-api/src/graphql/create-apollo-server.ts @@ -7,7 +7,7 @@ import { ApolloServerPluginLandingPageGraphQLPlayground } from "@apollo/server-p import { KeyvAdapter } from "@apollo/utils.keyvadapter"; import { expressMiddleware } from "@as-integrations/express5"; import { makeExecutableSchema } from "@graphql-tools/schema"; -import type { UploadableStorageProvider } from "@local/hash-backend-utils/file-storage"; +import type { FileStorageProvider } from 
"@local/hash-backend-utils/file-storage"; import type { Logger } from "@local/hash-backend-utils/logger"; import type { TemporalClient } from "@local/hash-backend-utils/temporal"; import type { VaultClient } from "@local/hash-backend-utils/vault"; @@ -113,7 +113,7 @@ const statsPlugin = ({ export interface CreateApolloServerParams { graphApi: GraphApi; cache: Keyv; - uploadProvider: UploadableStorageProvider; + uploadProvider: FileStorageProvider; temporalClient: TemporalClient; vaultClient?: VaultClient; emailTransporter: EmailTransporter; diff --git a/apps/hash-api/src/graphql/resolvers/flows/get-flow-run-by-id.ts b/apps/hash-api/src/graphql/resolvers/flows/get-flow-run-by-id.ts index 322c9978c02..ec1a00992ae 100644 --- a/apps/hash-api/src/graphql/resolvers/flows/get-flow-run-by-id.ts +++ b/apps/hash-api/src/graphql/resolvers/flows/get-flow-run-by-id.ts @@ -25,6 +25,7 @@ export const getFlowRunByIdResolver: ResolverFn< flowRunId: flowRunId as EntityUuid, graphApiClient: context.dataSources.graphApi, includeDetails, + storageProvider: context.dataSources.uploadProvider, temporalClient: context.temporal, userAuthentication: context.authentication, }); diff --git a/apps/hash-api/src/graphql/resolvers/flows/get-flow-runs.ts b/apps/hash-api/src/graphql/resolvers/flows/get-flow-runs.ts index e59258dff52..acb48745c01 100644 --- a/apps/hash-api/src/graphql/resolvers/flows/get-flow-runs.ts +++ b/apps/hash-api/src/graphql/resolvers/flows/get-flow-runs.ts @@ -29,6 +29,7 @@ export const getFlowRunsResolver: ResolverFn< }, graphApiClient: dataSources.graphApi, includeDetails, + storageProvider: dataSources.uploadProvider, temporalClient: temporal, }); }; diff --git a/apps/hash-api/src/index.ts b/apps/hash-api/src/index.ts index 9d17f2ba5c6..e1c2a18ddde 100644 --- a/apps/hash-api/src/index.ts +++ b/apps/hash-api/src/index.ts @@ -641,6 +641,7 @@ const main = async () => { context: machineActorContext, httpServer, logger, + storageProvider: uploadProvider, temporalClient, }); 
diff --git a/apps/hash-api/src/storage/index.ts b/apps/hash-api/src/storage/index.ts index 8446dfa8915..6547613b443 100644 --- a/apps/hash-api/src/storage/index.ts +++ b/apps/hash-api/src/storage/index.ts @@ -4,7 +4,6 @@ import { getAwsS3Config } from "@local/hash-backend-utils/aws-config"; import type { FileStorageProvider, StorageType, - UploadableStorageProvider, } from "@local/hash-backend-utils/file-storage"; import { isStorageType, @@ -37,9 +36,7 @@ const DOWNLOAD_URL_EXPIRATION_SECONDS = 60 * 60 * 24 * 7; // 1 hour. const DOWNLOAD_URL_CACHE_OFFSET_SECONDS = 60 * 60; -type StorageProviderInitialiser = ( - app: Express, -) => FileStorageProvider | UploadableStorageProvider; +type StorageProviderInitialiser = (app: Express) => FileStorageProvider; const storageProviderInitialiserLookup: Record< StorageType, @@ -68,20 +65,20 @@ export const initialiseStorageProvider = ( return newProvider; }; -export const getUploadStorageProvider = (): UploadableStorageProvider => { +export const getUploadStorageProvider = (): FileStorageProvider => { const uploadProvider = storageProviderLookup[uploadStorageProvider]; if (!uploadProvider) { throw new Error( `Upload storage provider ${uploadStorageProvider} is required by the app but doesn't exist`, ); } - return uploadProvider as UploadableStorageProvider; + return uploadProvider; }; export const setupStorageProviders = ( app: Express, fileUploadProvider: StorageType, -): UploadableStorageProvider => { +): FileStorageProvider => { initialiseStorageProvider(app, fileUploadProvider); uploadStorageProvider = fileUploadProvider; return getUploadStorageProvider(); diff --git a/apps/hash-api/src/storage/local-file-storage.ts b/apps/hash-api/src/storage/local-file-storage.ts index c3256cba93e..1c649c63da5 100644 --- a/apps/hash-api/src/storage/local-file-storage.ts +++ b/apps/hash-api/src/storage/local-file-storage.ts @@ -2,13 +2,15 @@ import fs from "node:fs"; import path from "node:path"; import { URL } from "node:url"; +import 
type { Url } from "@blockprotocol/type-system"; import type { + FileStorageProvider, GetFileEntityStorageKeyParams, + GetFlowOutputStorageKeyParams, PresignedDownloadRequest, PresignedPutUpload, PresignedStorageRequest, StorageType, - UploadableStorageProvider, } from "@local/hash-backend-utils/file-storage"; import type { File } from "@local/hash-isomorphic-utils/system-types/shared"; import appRoot from "app-root-path"; @@ -30,9 +32,7 @@ export interface LocalFileSystemStorageProviderConstructorArgs { * NOTE: NOT MEANT TO BE USED IN PRODUCTION * This storage provider is given as an easy to setup alternative to S3 file uploads for simple setups. */ -export class LocalFileSystemStorageProvider - implements UploadableStorageProvider -{ +export class LocalFileSystemStorageProvider implements FileStorageProvider { public storageType: StorageType = "LOCAL_FILE_SYSTEM"; private fileUploadPath: string; @@ -83,9 +83,9 @@ export class LocalFileSystemStorageProvider }; } - async presignDownload(params: PresignedDownloadRequest): Promise { + async presignDownload(params: PresignedDownloadRequest): Promise { return new URL(path.join(DOWNLOAD_BASE_URL, params.key), this.apiOrigin) - .href; + .href as Url; } getFileEntityStorageKey({ @@ -102,6 +102,58 @@ export class LocalFileSystemStorageProvider return `${folder}/${filename}` as const; } + /** + * Generate a storage key for flow output payloads. + * Format: flows/{workflowId}/{runId}/{stepId}/{outputName}.json + */ + getFlowOutputStorageKey({ + workflowId, + runId, + stepId, + outputName, + }: GetFlowOutputStorageKeyParams) { + return `flows/${workflowId}/${runId}/${stepId}/${outputName}.json` as const; + } + + /** + * Upload data directly to local storage without presigning. 
+ */ + async uploadDirect({ + key, + body, + }: { + key: string; + body: string | Buffer; + contentType?: string; + }): Promise { + const filePath = path.join(this.fileUploadPath, path.normalize(key)); + + if (!filePath.startsWith(this.fileUploadPath)) { + throw new Error("Invalid key: path traversal detected"); + } + + // Ensure the directory exists + const dir = path.dirname(filePath); + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true }); + } + + await fs.promises.writeFile(filePath, body); + } + + /** + * Download data directly from local storage without presigning. + */ + async downloadDirect({ key }: { key: string }): Promise { + const filePath = path.join(this.fileUploadPath, path.normalize(key)); + + if (!filePath.startsWith(this.fileUploadPath)) { + throw new Error("Invalid key: path traversal detected"); + } + + return fs.promises.readFile(filePath); + } + /** Sets up express routes required for uploading and downloading files */ setupExpressRoutes(app: Express) { app.put(UPLOAD_BASE_URL, async (req, res, _next) => { diff --git a/apps/hash-integration-worker/src/activities/flow-activities.ts b/apps/hash-integration-worker/src/activities/flow-activities.ts index 0ec2201860b..b43325410ee 100644 --- a/apps/hash-integration-worker/src/activities/flow-activities.ts +++ b/apps/hash-integration-worker/src/activities/flow-activities.ts @@ -12,6 +12,6 @@ export const createFlowActivities: CreateFlowActivities< }: { graphApiClient: GraphApi; }) => ({ - ...createAviationActivities(), + ...createAviationActivities({ graphApiClient }), ...createIntegrationActivities({ graphApiClient }), }); diff --git a/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities.ts b/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities.ts index 300c023e466..31dd8a6c05f 100644 --- a/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities.ts +++ 
b/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities.ts @@ -1,14 +1,34 @@ +import type { GraphApi } from "@local/hash-graph-client"; + +import { getHistoricalFlightArrivalsAction } from "./aviation-activities/get-historical-flight-arrivals-action.js"; +import { createGetLiveFlightPositionsAction } from "./aviation-activities/get-live-flight-positions-action.js"; import { getScheduledFlightsAction } from "./aviation-activities/get-scheduled-flights-action.js"; +export { getHistoricalFlightArrivalsAction } from "./aviation-activities/get-historical-flight-arrivals-action.js"; +export { createGetLiveFlightPositionsAction } from "./aviation-activities/get-live-flight-positions-action.js"; export { getScheduledFlightsAction } from "./aviation-activities/get-scheduled-flights-action.js"; export { createPersistIntegrationEntitiesAction as createPersistFlightEntitiesAction } from "./integration-activities/persist-integration-entities-action.js"; /** * Creates the aviation flow action activities. */ -export const createAviationActivities = () => ({ +export const createAviationActivities = ({ + graphApiClient, +}: { + graphApiClient: GraphApi; +}) => ({ + /** + * Fetches historical flight arrivals from AeroAPI for a date range and returns them as ProposedEntity objects. + */ + getHistoricalFlightArrivalsAction, /** * Fetches scheduled flights from AeroAPI and returns them as ProposedEntity objects. */ getScheduledFlightsAction, + /** + * Fetches live flight positions from FlightRadar24 for flights that have departed. 
+ */ + getLiveFlightPositionsAction: createGetLiveFlightPositionsAction({ + graphApiClient, + }), }); diff --git a/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities/get-historical-flight-arrivals-action.ts b/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities/get-historical-flight-arrivals-action.ts new file mode 100644 index 00000000000..98df16c0217 --- /dev/null +++ b/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities/get-historical-flight-arrivals-action.ts @@ -0,0 +1,141 @@ +import { + extractBaseUrl, + type OriginProvenance, +} from "@blockprotocol/type-system"; +import type { IntegrationFlowActionActivity } from "@local/hash-backend-utils/flows"; +import { + getStorageProvider, + storePayload, +} from "@local/hash-backend-utils/flows/payload-storage"; +import { getHistoricalArrivalEntities } from "@local/hash-backend-utils/integrations/aviation"; +import { getSimplifiedIntegrationFlowActionInputs } from "@local/hash-isomorphic-utils/flows/action-definitions"; +import type { ProposedEntity } from "@local/hash-isomorphic-utils/flows/types"; +import { systemEntityTypes } from "@local/hash-isomorphic-utils/ontology-type-ids"; +import { StatusCode } from "@local/status"; + +import { getFlowContext } from "../shared/get-integration-flow-context.js"; +import { aviationProposedEntityToFlowProposedEntity } from "./get-scheduled-flights-action.js"; + +/** + * Validates that the end date is yesterday or earlier. + * Historical flight data is only available for completed flights. + */ +const validateEndDate = (endDate: string): void => { + const endDateObj = new Date(`${endDate}T23:59:59Z`); + const today = new Date(); + today.setUTCHours(0, 0, 0, 0); + + if (endDateObj >= today) { + throw new Error( + `End date must be yesterday or earlier. Received: ${endDate}`, + ); + } +}; + +/** + * Validates that the start date is not after the end date. 
+ */ +const validateDateRange = (startDate: string, endDate: string): void => { + const startDateObj = new Date(startDate); + const endDateObj = new Date(endDate); + + if (startDateObj > endDateObj) { + throw new Error( + `Start date (${startDate}) cannot be after end date (${endDate})`, + ); + } +}; + +/** + * Fetches historical arrival flights from AeroAPI for a given airport and date range + * and returns them as ProposedEntity objects. + */ +export const getHistoricalFlightArrivalsAction: IntegrationFlowActionActivity< + "getHistoricalFlightArrivals" +> = async ({ inputs }) => { + try { + const { airportIcao, startDate, endDate } = + getSimplifiedIntegrationFlowActionInputs({ + inputs, + actionType: "getHistoricalFlightArrivals", + }); + + // Validate inputs + validateEndDate(endDate); + validateDateRange(startDate, endDate); + + const { entities, provenance } = await getHistoricalArrivalEntities( + airportIcao, + startDate, + endDate, + ); + + const fullProvenance = { + ...provenance, + actorType: "machine" as const, + origin: { + type: "flow", + id: "aviation-integration", + } satisfies OriginProvenance, + }; + + const proposedEntities: ProposedEntity[] = []; + + let flightCount = 0; + for (const entity of entities.values()) { + if ( + entity.entityTypeIds.some( + (entityTypeId) => + extractBaseUrl(entityTypeId) === + systemEntityTypes.flight.entityTypeBaseUrl, + ) + ) { + flightCount++; + } + + proposedEntities.push( + aviationProposedEntityToFlowProposedEntity(entity, fullProvenance), + ); + } + + // Store the proposed entities in S3 to avoid passing large payloads through Temporal + const { workflowId, runId, stepId } = await getFlowContext(); + + const storedRef = await storePayload({ + storageProvider: getStorageProvider(), + workflowId, + runId, + stepId, + outputName: "proposedEntities", + kind: "ProposedEntity", + value: proposedEntities, + }); + + return { + code: StatusCode.Ok, + message: `Generated ${flightCount} flights and ${entities.size - 
flightCount} related entities for ${airportIcao} from ${startDate} to ${endDate}`, + contents: [ + { + outputs: [ + { + outputName: "proposedEntities", + payload: { + kind: "ProposedEntity", + value: storedRef, + }, + }, + ], + }, + ], + }; + } catch (error) { + const errorMessage = + error instanceof Error ? error.message : "Unknown error occurred"; + + return { + code: StatusCode.Internal, + message: `Failed to fetch historical flight arrivals: ${errorMessage}`, + contents: [], + }; + } +}; diff --git a/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities/get-live-flight-positions-action.ts b/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities/get-live-flight-positions-action.ts new file mode 100644 index 00000000000..9998c5c8d46 --- /dev/null +++ b/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities/get-live-flight-positions-action.ts @@ -0,0 +1,426 @@ +import { + getOutgoingLinksForEntity, + getRoots, +} from "@blockprotocol/graph/stdlib"; +import { + type EntityId, + extractEntityUuidFromEntityId, + type LinkEntity, + type OriginProvenance, + type PropertyMetadata, +} from "@blockprotocol/type-system"; +import type { IntegrationFlowActionActivity } from "@local/hash-backend-utils/flows"; +import { + getStorageProvider, + resolvePayloadValue, + storePayload, +} from "@local/hash-backend-utils/flows/payload-storage"; +import { getFlightPositionProperties } from "@local/hash-backend-utils/integrations/aviation/flightradar24/client"; +import type { PrimaryKeyInput } from "@local/hash-backend-utils/integrations/aviation/shared/primary-keys"; +import type { GraphApi } from "@local/hash-graph-client"; +import { queryEntitySubgraph } from "@local/hash-graph-sdk/entity"; +import { getSimplifiedIntegrationFlowActionInputs } from "@local/hash-isomorphic-utils/flows/action-definitions"; +import type { ProposedEntity } from "@local/hash-isomorphic-utils/flows/types"; +import { 
currentTimeInstantTemporalAxes } from "@local/hash-isomorphic-utils/graph-queries"; +import { + systemEntityTypes, + systemLinkEntityTypes, + systemPropertyTypes, +} from "@local/hash-isomorphic-utils/ontology-type-ids"; +import { + type ArrivesAt, + type ArrivesAtProperties, + type DepartsFrom, + type DepartsFromProperties, + type Flight, +} from "@local/hash-isomorphic-utils/system-types/flight"; +import type { + DateDataTypeMetadata, + TextDataTypeMetadata, +} from "@local/hash-isomorphic-utils/system-types/shared"; +import { StatusCode } from "@local/status"; + +import { getFlowContext } from "../shared/get-integration-flow-context.js"; +import { splitPropertiesAndMetadata } from "../shared/split-properties-and-metadata.js"; + +const TEN_MINUTES_MS = 10 * 60 * 1000; + +/** + * Determines if a flight should have its live position fetched based on: + * 1. Expected departure time has passed + * 2. There is no confirmed arrival time, or confirmed arrival time was in the last 10 minutes + */ +const shouldFetchLivePosition = ( + departsFromProperties: DepartsFromProperties, + arrivesAtProperties: ArrivesAtProperties, +): boolean => { + // Get departure time - prefer estimated, fall back to scheduled + const actualDepartureTime = + departsFromProperties[ + "https://hash.ai/@h/types/property-type/actual-gate-time/" + ]; + const estimatedDepartureTime = + departsFromProperties[ + "https://hash.ai/@h/types/property-type/estimated-gate-time/" + ]; + const scheduledDepartureTime = + departsFromProperties[ + "https://hash.ai/@h/types/property-type/scheduled-gate-time/" + ]; + + const expectedDepartureTime = + actualDepartureTime ?? estimatedDepartureTime ?? 
scheduledDepartureTime; + + const actualArrivalTime = + arrivesAtProperties[ + "https://hash.ai/@h/types/property-type/actual-gate-time/" + ]; + + const now = Date.now(); + + // Check condition 1: Expected departure time has passed + const departureHasPassed = + expectedDepartureTime && new Date(expectedDepartureTime).getTime() < now; + + // Check condition 2: No confirmed arrival, or arrival was in last 10 minutes + const noConfirmedArrival = !actualArrivalTime; + const arrivedInLastTenMinutes = + actualArrivalTime && + now - new Date(actualArrivalTime).getTime() < TEN_MINUTES_MS; + + return Boolean( + departureHasPassed && (noConfirmedArrival || arrivedInLastTenMinutes), + ); +}; + +/** + * Creates the get live flight positions action that fetches live positions + * for flights that have departed or recently arrived. + */ +export const createGetLiveFlightPositionsAction = ({ + graphApiClient, +}: { + graphApiClient: GraphApi; +}): IntegrationFlowActionActivity<"getLiveFlightPositions"> => { + return async ({ inputs }) => { + try { + const { flowEntityId, runId, stepId, userAuthentication, workflowId } = + await getFlowContext({ graphApiClient }); + + const { persistedEntities: persistedEntitiesInput } = + getSimplifiedIntegrationFlowActionInputs({ + inputs, + actionType: "getLiveFlightPositions", + }); + + // The input is a stored reference - resolve it + const persistedEntities = await resolvePayloadValue( + getStorageProvider(), + "PersistedEntitiesMetadata", + persistedEntitiesInput, + ); + + const flightEntityIds = persistedEntities.persistedEntities.map( + ({ entityId }) => entityId, + ); + + if (flightEntityIds.length === 0) { + const emptyStoredRef = await storePayload({ + storageProvider: getStorageProvider(), + workflowId, + runId, + stepId, + outputName: "proposedEntities", + kind: "ProposedEntity", + value: [], + }); + + return { + code: StatusCode.Ok, + message: "No persisted entities to check for live positions", + contents: [ + { + outputs: [ + { + 
outputName: "proposedEntities", + payload: { + kind: "ProposedEntity", + value: emptyStoredRef, + }, + }, + ], + }, + ], + }; + } + + const { subgraph } = await queryEntitySubgraph( + { graphApi: graphApiClient }, + userAuthentication, + { + filter: { + any: flightEntityIds.map((entityId) => ({ + equal: [ + { path: ["uuid"] }, + { parameter: extractEntityUuidFromEntityId(entityId) }, + ], + })), + }, + traversalPaths: [ + { + edges: [ + { + kind: "has-left-entity", + direction: "incoming", + }, + ], + }, + ], + temporalAxes: currentTimeInstantTemporalAxes, + includeDrafts: false, + includePermissions: false, + }, + ); + + const rootEntities = getRoots(subgraph); + + const flightsToUpdate: Array<{ + entityId: EntityId; + flightNumber: string; + primaryKeyProperties: PrimaryKeyInput["flight"]; + flightNumberPropertyMetadata: PropertyMetadata; + flightDatePropertyMetadata: PropertyMetadata; + }> = []; + + for (const entity of rootEntities) { + const flightNumber = + entity.properties[ + "https://hash.ai/@h/types/property-type/flight-number/" + ]; + + if (!flightNumber) { + continue; + } + + const outgoingLinks = getOutgoingLinksForEntity( + subgraph, + entity.metadata.recordId.entityId, + ); + + const departsFromLink = outgoingLinks.find( + (link): link is LinkEntity => + link.metadata.entityTypeIds.includes( + systemLinkEntityTypes.departsFrom.linkEntityTypeId, + ), + ); + + const arrivesAtLink = outgoingLinks.find( + (link): link is LinkEntity => + link.metadata.entityTypeIds.includes( + systemLinkEntityTypes.arrivesAt.linkEntityTypeId, + ), + ); + + if ( + departsFromLink && + arrivesAtLink && + shouldFetchLivePosition( + departsFromLink.properties, + arrivesAtLink.properties, + ) + ) { + const flightNumberPropertyMetadata = entity.propertyMetadata([ + systemPropertyTypes.flightNumber.propertyTypeBaseUrl, + ]); + + const flightDatePropertyMetadata = entity.propertyMetadata([ + systemPropertyTypes.flightDate.propertyTypeBaseUrl, + ]); + + if 
(!flightNumberPropertyMetadata) { + throw new Error( + `Flight number property metadata not found for flight entity ${entity.metadata.recordId.entityId}`, + ); + } + + if (!flightDatePropertyMetadata) { + throw new Error( + `Flight date property metadata not found for flight entity ${entity.metadata.recordId.entityId}`, + ); + } + + flightsToUpdate.push({ + entityId: entity.metadata.recordId.entityId, + flightNumber, + primaryKeyProperties: { + flightNumber, + flightDate: + entity.properties[ + "https://hash.ai/@h/types/property-type/flight-date/" + ]!, + }, + flightNumberPropertyMetadata, + flightDatePropertyMetadata, + }); + } + } + + if (flightsToUpdate.length === 0) { + const emptyStoredRef = await storePayload({ + storageProvider: getStorageProvider(), + workflowId, + runId, + stepId, + outputName: "proposedEntities", + kind: "ProposedEntity", + value: [], + }); + + return { + code: StatusCode.Ok, + message: "No flights require live position updates", + contents: [ + { + outputs: [ + { + outputName: "proposedEntities", + payload: { + kind: "ProposedEntity", + value: emptyStoredRef, + }, + }, + ], + }, + ], + }; + } + + // Fetch live positions for each flight + const proposedEntities: ProposedEntity[] = []; + let successCount = 0; + let notFoundCount = 0; + + for (const { + entityId, + flightNumber, + primaryKeyProperties, + flightNumberPropertyMetadata, + flightDatePropertyMetadata, + } of flightsToUpdate) { + const positionData = await getFlightPositionProperties(flightNumber); + + if (!positionData) { + notFoundCount++; + continue; + } + + const { properties, provenance: sourceProvenance } = positionData; + + const propertiesWithPrimaryKey: Partial< + Flight["propertiesWithMetadata"]["value"] + > = { + ...properties, + }; + + /** + * We need the primary key properties passed out of this action, + * because persistIntegrationEntities relies on them to match existing entities. 
+ */ + for (const [propertyType, propertyValue] of Object.entries( + primaryKeyProperties, + )) { + switch (propertyType) { + case "flightNumber": + propertiesWithPrimaryKey[ + "https://hash.ai/@h/types/property-type/flight-number/" + ] = { + value: propertyValue, + metadata: + flightNumberPropertyMetadata as unknown as TextDataTypeMetadata, + }; + break; + case "flightDate": + propertiesWithPrimaryKey[ + "https://hash.ai/@h/types/property-type/flight-date/" + ] = { + value: propertyValue, + metadata: + flightDatePropertyMetadata as unknown as DateDataTypeMetadata, + }; + break; + default: + throw new Error( + `Unhandled primary key property type: ${propertyType}`, + ); + } + } + + const { properties: propertiesOnly, propertyMetadata } = + splitPropertiesAndMetadata({ + value: propertiesWithPrimaryKey, + }); + + const proposedEntity: ProposedEntity = { + claims: { + isSubjectOf: [], + isObjectOf: [], + }, + provenance: { + actorType: "machine", + origin: { + type: "flow", + id: flowEntityId, + stepIds: [stepId], + } satisfies OriginProvenance, + ...sourceProvenance, + }, + propertyMetadata, + localEntityId: entityId, + entityTypeIds: [systemEntityTypes.flight.entityTypeId], + properties: propertiesOnly, + }; + + proposedEntities.push(proposedEntity); + successCount++; + } + + // Store the proposed entities in S3 to avoid passing large payloads through Temporal + const storedRef = await storePayload({ + storageProvider: getStorageProvider(), + workflowId, + runId, + stepId, + outputName: "proposedEntities", + kind: "ProposedEntity", + value: proposedEntities, + }); + + return { + code: StatusCode.Ok, + message: `Fetched live positions for ${successCount} flights (${notFoundCount} not found in FlightRadar24)`, + contents: [ + { + outputs: [ + { + outputName: "proposedEntities", + payload: { + kind: "ProposedEntity", + value: storedRef, + }, + }, + ], + }, + ], + }; + } catch (error) { + const errorMessage = + error instanceof Error ? 
error.message : "Unknown error occurred"; + + return { + code: StatusCode.Internal, + message: `Failed to fetch live flight positions: ${errorMessage}`, + contents: [], + }; + } + }; +}; diff --git a/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities/get-scheduled-flights-action.ts b/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities/get-scheduled-flights-action.ts index a7c2a7bc6cd..09b319fca7c 100644 --- a/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities/get-scheduled-flights-action.ts +++ b/apps/hash-integration-worker/src/activities/flow-activities/aviation-activities/get-scheduled-flights-action.ts @@ -5,6 +5,10 @@ import { type ProvidedEntityEditionProvenance, } from "@blockprotocol/type-system"; import type { IntegrationFlowActionActivity } from "@local/hash-backend-utils/flows"; +import { + getStorageProvider, + storePayload, +} from "@local/hash-backend-utils/flows/payload-storage"; import type { AviationProposedEntity } from "@local/hash-backend-utils/integrations/aviation"; import { getScheduledArrivalEntities } from "@local/hash-backend-utils/integrations/aviation"; import { getSimplifiedIntegrationFlowActionInputs } from "@local/hash-isomorphic-utils/flows/action-definitions"; @@ -12,6 +16,7 @@ import type { ProposedEntity } from "@local/hash-isomorphic-utils/flows/types"; import { systemEntityTypes } from "@local/hash-isomorphic-utils/ontology-type-ids"; import { StatusCode } from "@local/status"; +import { getFlowContext } from "../shared/get-integration-flow-context.js"; import { splitPropertiesAndMetadata } from "../shared/split-properties-and-metadata.js"; /** @@ -59,7 +64,7 @@ export const aviationProposedEntityToFlowProposedEntity = ( }; /** - * Fetches scheduled flights from AeroAPI for a given airport and date and returns them as ProposedEntity objects. 
+ * Fetches scheduled arrival flights from AeroAPI for a given airport and date and returns them as ProposedEntity objects. */ export const getScheduledFlightsAction: IntegrationFlowActionActivity< "getScheduledFlights" @@ -103,6 +108,19 @@ export const getScheduledFlightsAction: IntegrationFlowActionActivity< ); } + // Store the proposed entities in S3 to avoid passing large payloads through Temporal + const { workflowId, runId, stepId } = await getFlowContext(); + + const storedRef = await storePayload({ + storageProvider: getStorageProvider(), + workflowId, + runId, + stepId, + outputName: "proposedEntities", + kind: "ProposedEntity", + value: proposedEntities, + }); + return { code: StatusCode.Ok, message: `Generated ${flightCount} flights and ${entities.size - flightCount} related entities for ${airportIcao} on ${date}`, @@ -113,7 +131,7 @@ export const getScheduledFlightsAction: IntegrationFlowActionActivity< outputName: "proposedEntities", payload: { kind: "ProposedEntity", - value: proposedEntities, + value: storedRef, }, }, ], diff --git a/apps/hash-integration-worker/src/activities/flow-activities/integration-activities/persist-integration-entities-action.ts b/apps/hash-integration-worker/src/activities/flow-activities/integration-activities/persist-integration-entities-action.ts index 0885cec853d..d58f09924ab 100644 --- a/apps/hash-integration-worker/src/activities/flow-activities/integration-activities/persist-integration-entities-action.ts +++ b/apps/hash-integration-worker/src/activities/flow-activities/integration-activities/persist-integration-entities-action.ts @@ -8,6 +8,11 @@ import { type WebId, } from "@blockprotocol/type-system"; import type { IntegrationFlowActionActivity } from "@local/hash-backend-utils/flows"; +import { + getStorageProvider, + resolvePayloadValue, + storePayload, +} from "@local/hash-backend-utils/flows/payload-storage"; import { generateEntityMatcher, generateLinkMatcher, @@ -31,7 +36,9 @@ import { 
currentTimeInstantTemporalAxes, generateVersionedUrlMatchingFilter, } from "@local/hash-isomorphic-utils/graph-queries"; +import { stringifyError } from "@local/hash-isomorphic-utils/stringify-error"; import { StatusCode } from "@local/status"; +import { Context } from "@temporalio/activity"; import { getFlowContext } from "../shared/get-integration-flow-context.js"; @@ -79,6 +86,42 @@ const findExistingEntity = async (params: { return entity ?? null; }; +/** + * Executes an array of async operations in parallel batches. + * + * This function processes items in batches, executing all operations within each batch + * concurrently via `Promise.all`. Heartbeats are sent at the start of each batch to + * signal activity progress to Temporal. + * + * **Error handling behavior:** + * - If an operation throws an error, `Promise.all` will reject and the entire batch fails. + * - For partial batch success, the operation callback should catch its own errors and + * return a result indicating success or failure (e.g., a discriminated union type). + * - Successfully completed batches are preserved even if a later batch fails. + * + * @param items - The items to process + * @param batchSize - Number of items to process in each batch + * @param operation - Async function to execute for each item. Should handle its own errors + * if partial batch success is desired. 
+ * @returns Array of results from all operations + */ +const executeInBatches = async ( + items: T[], + batchSize: number, + operation: (item: T) => Promise, +): Promise => { + const results: R[] = []; + + for (let i = 0; i < items.length; i += batchSize) { + Context.current().heartbeat(); + const batch = items.slice(i, i + batchSize); + const batchResults = await Promise.all(batch.map(operation)); + results.push(...batchResults); + } + + return results; +}; + const findExistingLink = async (params: { authentication: { actorId: ActorEntityUuid }; graphApiClient: GraphApi; @@ -131,6 +174,26 @@ const findExistingLink = async (params: { return entity ? new HashLinkEntity(entity) : null; }; +const BATCH_SIZE = 100; + +type EntityToCreate = { + proposedEntity: ProposedEntity; + params: Parameters[2]; +}; + +type EntityToPatch = { + proposedEntity: ProposedEntity; + existingEntity: HashEntity; + propertyPatches: Parameters< + typeof HashEntity.prototype.patch + >[2]["propertyPatches"]; +}; + +type EntityUnchanged = { + proposedEntity: ProposedEntity; + existingEntity: HashEntity; +}; + /** * Persists proposed entities to the graph, creating new entities as needed. * Returns the mapping of local entity IDs to persisted entity IDs. @@ -162,7 +225,14 @@ const persistEntities = async (params: { (entity) => !entity.sourceEntityId && !entity.targetEntityId, ); + // Phase 1: Find existing entities and categorize operations + const entitiesToCreate: EntityToCreate[] = []; + const entitiesToPatch: EntityToPatch[] = []; + const unchangedEntities: EntityUnchanged[] = []; + for (const proposedEntity of nonLinkEntities) { + Context.current().heartbeat(); + try { const existingEntity = await findExistingEntity({ graphApiClient, @@ -180,36 +250,22 @@ const persistEntities = async (params: { const propertyPatches = patchesFromPropertyObjects({ oldProperties: existingEntity.properties, newProperties, + removeProperties: false, }); - const updatedEntity = - propertyPatches.length > 0 - ? 
await existingEntity.patch(graphApiClient, authentication, { - propertyPatches, - provenance: { - ...provenance, - sources: proposedEntity.provenance.sources, - }, - }) - : existingEntity; - - entityIdsByLocalId.set( - proposedEntity.localEntityId, - updatedEntity.metadata.recordId.entityId, - ); - - persistedEntitiesMetadata.push({ - entityId: updatedEntity.metadata.recordId.entityId, - operation: - propertyPatches.length > 0 - ? "update" - : "already-exists-as-proposed", - }); + if (propertyPatches.length > 0) { + entitiesToPatch.push({ + proposedEntity, + existingEntity, + propertyPatches, + }); + } else { + unchangedEntities.push({ proposedEntity, existingEntity }); + } } else { - const newEntity = await HashEntity.create( - graphApiClient, - authentication, - { + entitiesToCreate.push({ + proposedEntity, + params: { webId, draft: false, properties: mergePropertyObjectAndMetadata( @@ -222,25 +278,126 @@ const persistEntities = async (params: { }, entityTypeIds: proposedEntity.entityTypeIds, }, - ); + }); + } + } catch (error) { + const errorMessage = + error instanceof Error ? error.message : "Unknown error"; + failedEntityProposals.push({ + proposedEntity, + message: `Failed to find existing entity: ${errorMessage}. 
${stringifyError(error)}`, + }); + } + } + + // Phase 2: Handle unchanged entities (no API call needed) + for (const { proposedEntity, existingEntity } of unchangedEntities) { + entityIdsByLocalId.set( + proposedEntity.localEntityId, + existingEntity.metadata.recordId.entityId, + ); + persistedEntitiesMetadata.push({ + entityId: existingEntity.metadata.recordId.entityId, + operation: "already-exists-as-proposed", + }); + } + + // Phase 3: Batch create new entities in groups of BATCH_SIZE + for ( + let batchStartIndex = 0; + batchStartIndex < entitiesToCreate.length; + batchStartIndex += BATCH_SIZE + ) { + Context.current().heartbeat(); + const batch = entitiesToCreate.slice( + batchStartIndex, + batchStartIndex + BATCH_SIZE, + ); + + try { + const createdEntities = await HashEntity.createMultiple( + graphApiClient, + authentication, + batch.map((item) => item.params), + ); + + for ( + let entityIndexInBatch = 0; + entityIndexInBatch < createdEntities.length; + entityIndexInBatch++ + ) { + const proposedEntity = batch[entityIndexInBatch]!.proposedEntity; + const createdEntity = createdEntities[entityIndexInBatch]!; entityIdsByLocalId.set( proposedEntity.localEntityId, - newEntity.metadata.recordId.entityId, + createdEntity.metadata.recordId.entityId, ); - persistedEntitiesMetadata.push({ - entityId: newEntity.metadata.recordId.entityId, + entityId: createdEntity.metadata.recordId.entityId, operation: "create", }); } } catch (error) { + // If batch creation fails, add all entities in this batch to failed proposals const errorMessage = error instanceof Error ? error.message : "Unknown error"; - failedEntityProposals.push({ - proposedEntity, - message: `Failed to persist entity: ${errorMessage}`, - }); + for (const { proposedEntity } of batch) { + failedEntityProposals.push({ + proposedEntity, + message: `Failed to create entity in batch: ${errorMessage}. 
${stringifyError(error)}`, + }); + } + } + } + + // Phase 4: Patch existing entities in parallel batches + if (entitiesToPatch.length > 0) { + const patchResults = await executeInBatches( + entitiesToPatch, + BATCH_SIZE, + async ({ proposedEntity, existingEntity, propertyPatches }) => { + try { + const updatedEntity = await existingEntity.patch( + graphApiClient, + authentication, + { + propertyPatches, + provenance: { + ...provenance, + sources: proposedEntity.provenance.sources, + }, + }, + ); + return { success: true as const, proposedEntity, updatedEntity }; + } catch (error) { + const errorMessage = + error instanceof Error ? error.message : "Unknown error"; + return { + success: false as const, + proposedEntity, + error: `Failed to patch entity: ${errorMessage}. ${stringifyError(error)}`, + }; + } + }, + ); + + for (const result of patchResults) { + if (result.success) { + entityIdsByLocalId.set( + result.proposedEntity.localEntityId, + result.updatedEntity.metadata.recordId.entityId, + ); + persistedEntitiesMetadata.push({ + entityId: result.updatedEntity.metadata.recordId.entityId, + operation: "update", + }); + } else { + failedEntityProposals.push({ + proposedEntity: result.proposedEntity, + message: result.error, + }); + } } } @@ -251,6 +408,24 @@ const persistEntities = async (params: { }; }; +type LinkToCreate = { + proposedLink: ProposedEntity; + params: Parameters[2]; +}; + +type LinkToPatch = { + proposedLink: ProposedEntity; + existingLink: HashLinkEntity; + propertyPatches: Parameters< + typeof HashLinkEntity.prototype.patch + >[2]["propertyPatches"]; +}; + +type LinkUnchanged = { + proposedLink: ProposedEntity; + existingLink: HashLinkEntity; +}; + /** * Persists proposed links to the graph, creating new links where they don't exist. 
*/ @@ -281,7 +456,14 @@ const persistLinks = async (params: { (entity) => entity.sourceEntityId && entity.targetEntityId, ); + // Phase 1: Resolve entity IDs and find existing links + const linksToCreate: LinkToCreate[] = []; + const linksToPatch: LinkToPatch[] = []; + const unchangedLinks: LinkUnchanged[] = []; + for (const proposedLink of linkEntities) { + Context.current().heartbeat(); + const { sourceEntityId, targetEntityId } = proposedLink; if (!sourceEntityId || !targetEntityId) { @@ -331,31 +513,22 @@ const persistLinks = async (params: { const propertyPatches = patchesFromPropertyObjects({ oldProperties: existingLink.properties, newProperties, + removeProperties: false, }); - const updatedLink = - propertyPatches.length > 0 - ? await existingLink.patch(graphApiClient, authentication, { - propertyPatches, - provenance: { - ...provenance, - sources: proposedLink.provenance.sources, - }, - }) - : existingLink; - - persistedEntitiesMetadata.push({ - entityId: updatedLink.metadata.recordId.entityId, - operation: - propertyPatches.length > 0 - ? "update" - : "already-exists-as-proposed", - }); + if (propertyPatches.length > 0) { + linksToPatch.push({ + proposedLink, + existingLink, + propertyPatches, + }); + } else { + unchangedLinks.push({ proposedLink, existingLink }); + } } else { - const newLink = await HashLinkEntity.create( - graphApiClient, - authentication, - { + linksToCreate.push({ + proposedLink, + params: { webId, draft: false, linkData: { @@ -372,11 +545,6 @@ const persistLinks = async (params: { }, entityTypeIds: proposedLink.entityTypeIds, }, - ); - - persistedEntitiesMetadata.push({ - entityId: newLink.metadata.recordId.entityId, - operation: "create", }); } } catch (error) { @@ -384,11 +552,108 @@ const persistLinks = async (params: { error instanceof Error ? 
error.message : "Unknown error"; failedEntityProposals.push({ proposedEntity: proposedLink, - message: `Failed to persist link: ${errorMessage}`, + message: `Failed to find existing link: ${errorMessage}. ${stringifyError(error)}`, }); } } + // Phase 2: Handle unchanged links (no API call needed) + for (const { existingLink } of unchangedLinks) { + persistedEntitiesMetadata.push({ + entityId: existingLink.metadata.recordId.entityId, + operation: "already-exists-as-proposed", + }); + } + + // Phase 3: Batch create new links in groups of BATCH_SIZE + for ( + let batchStartIndex = 0; + batchStartIndex < linksToCreate.length; + batchStartIndex += BATCH_SIZE + ) { + Context.current().heartbeat(); + const batch = linksToCreate.slice( + batchStartIndex, + batchStartIndex + BATCH_SIZE, + ); + + try { + const createdLinks = await HashLinkEntity.createMultiple( + graphApiClient, + authentication, + batch.map((item) => item.params), + ); + + for ( + let linkIndexInBatch = 0; + linkIndexInBatch < createdLinks.length; + linkIndexInBatch++ + ) { + const createdLink = createdLinks[linkIndexInBatch]!; + persistedEntitiesMetadata.push({ + entityId: createdLink.metadata.recordId.entityId, + operation: "create", + }); + } + } catch (error) { + // If batch creation fails, add all links in this batch to failed proposals + const errorMessage = + error instanceof Error ? error.message : "Unknown error"; + for (const { proposedLink } of batch) { + failedEntityProposals.push({ + proposedEntity: proposedLink, + message: `Failed to create link in batch: ${errorMessage}. 
${stringifyError(error)}`, + }); + } + } + } + + // Phase 4: Patch existing links in parallel batches + if (linksToPatch.length > 0) { + const patchResults = await executeInBatches( + linksToPatch, + BATCH_SIZE, + async ({ proposedLink, existingLink, propertyPatches }) => { + try { + const updatedLink = await existingLink.patch( + graphApiClient, + authentication, + { + propertyPatches, + provenance: { + ...provenance, + sources: proposedLink.provenance.sources, + }, + }, + ); + return { success: true as const, updatedLink }; + } catch (error) { + const errorMessage = + error instanceof Error ? error.message : "Unknown error"; + return { + success: false as const, + proposedLink, + error: `Failed to patch link: ${errorMessage}. ${stringifyError(error)}`, + }; + } + }, + ); + + for (const result of patchResults) { + if (result.success) { + persistedEntitiesMetadata.push({ + entityId: result.updatedLink.metadata.recordId.entityId, + operation: "update", + }); + } else { + failedEntityProposals.push({ + proposedEntity: result.proposedLink, + message: result.error, + }); + } + } + } + return { persistedEntitiesMetadata, failedEntityProposals }; }; @@ -402,13 +667,26 @@ export const createPersistIntegrationEntitiesAction = ({ }): IntegrationFlowActionActivity<"persistIntegrationEntities"> => { return async ({ inputs }) => { try { - const { flowEntityId, stepId, userAuthentication, webId } = - await getFlowContext(); + const { + flowEntityId, + runId, + stepId, + userAuthentication, + webId, + workflowId, + } = await getFlowContext({ graphApiClient }); - const { proposedEntities } = getSimplifiedIntegrationFlowActionInputs({ - inputs, - actionType: "persistIntegrationEntities", - }); + const { proposedEntities: proposedEntitiesInput } = + getSimplifiedIntegrationFlowActionInputs({ + inputs, + actionType: "persistIntegrationEntities", + }); + + const proposedEntities = await resolvePayloadValue( + getStorageProvider(), + "ProposedEntity", + proposedEntitiesInput, + ); 
const provenance: ProvidedEntityEditionProvenance = { actorType: "machine", @@ -456,6 +734,17 @@ export const createPersistIntegrationEntitiesAction = ({ failedEntityProposals: allFailedProposals, }; + // Store the output in S3 to avoid passing large payloads through Temporal + const storedRef = await storePayload({ + storageProvider: getStorageProvider(), + workflowId, + runId, + stepId, + outputName: "persistedEntities", + kind: "PersistedEntitiesMetadata", + value: result, + }); + const code = allPersistedEntities.length > 0 ? StatusCode.Ok @@ -480,7 +769,7 @@ export const createPersistIntegrationEntitiesAction = ({ outputName: "persistedEntities", payload: { kind: "PersistedEntitiesMetadata", - value: result, + value: storedRef, }, }, ], diff --git a/apps/hash-integration-worker/src/activities/flow-activities/shared/get-integration-flow-context.ts b/apps/hash-integration-worker/src/activities/flow-activities/shared/get-integration-flow-context.ts index 57d3ed5e0d7..ef2aacfb550 100644 --- a/apps/hash-integration-worker/src/activities/flow-activities/shared/get-integration-flow-context.ts +++ b/apps/hash-integration-worker/src/activities/flow-activities/shared/get-integration-flow-context.ts @@ -1,120 +1,31 @@ -import type { - EntityId, - EntityUuid, - UserId, - WebId, -} from "@blockprotocol/type-system"; -import { entityIdFromComponents } from "@blockprotocol/type-system"; +import type { EntityId, UserId, WebId } from "@blockprotocol/type-system"; +import { + getBaseWorkflowParams, + getFlowContextCache, + getFlowEntityInfo, +} from "@local/hash-backend-utils/flows/get-flow-context"; import { createTemporalClient } from "@local/hash-backend-utils/temporal"; -import { parseHistoryItemPayload } from "@local/hash-backend-utils/temporal/parse-history-item-payload"; -import type { - BaseRunFlowWorkflowParams, - RunAiFlowWorkflowParams, -} from "@local/hash-isomorphic-utils/flows/temporal-types"; +import type { GraphApi } from "@local/hash-graph-client"; import { 
Context } from "@temporalio/activity"; import type { Client as TemporalClient } from "@temporalio/client"; -import type { MemoryCache } from "cache-manager"; -import { caching } from "cache-manager"; let _temporalClient: TemporalClient | undefined; -let _runFlowWorkflowParamsCache: MemoryCache | undefined; - -type PartialRunFlowWorkflowParams = Pick< - BaseRunFlowWorkflowParams, - "webId" | "userAuthentication" ->; - -const getCache = async () => { - _runFlowWorkflowParamsCache = - _runFlowWorkflowParamsCache ?? - (await caching("memory", { - max: 100, // 100 items - ttl: 10 * 60 * 1000, // 10 minutes - })); - return _runFlowWorkflowParamsCache; -}; - export const getTemporalClient = async () => { _temporalClient = _temporalClient ?? (await createTemporalClient()); return _temporalClient; }; -const getPartialRunFlowWorkflowParams = async (params: { +type BaseFlowContext = { + runId: string; + stepId: string; + userAuthentication: { actorId: UserId }; + webId: WebId; workflowId: string; -}): Promise => { - const { workflowId } = params; - - const runFlowWorkflowParamsCache = await getCache(); - - const cachedPartialRunFlowWorkflowParams = - await runFlowWorkflowParamsCache.get( - workflowId, - ); - - if (cachedPartialRunFlowWorkflowParams) { - return cachedPartialRunFlowWorkflowParams; - } - - const temporalClient = await getTemporalClient(); - const handle = temporalClient.workflow.getHandle(workflowId); - - const { events } = await handle.fetchHistory(); - - if (!events) { - throw new Error(`No events found for workflowId ${workflowId}`); - } - - const workflowExecutionStartedEventAttributes = - events.find((event) => event.workflowExecutionStartedEventAttributes) - ?.workflowExecutionStartedEventAttributes ?? 
undefined; - - if (!workflowExecutionStartedEventAttributes) { - throw new Error( - `No workflow execution started event attributes found for workflowId ${workflowId}`, - ); - } - - const inputs = parseHistoryItemPayload( - workflowExecutionStartedEventAttributes.input, - ); - - if (!inputs) { - throw new Error( - `No inputs found for workflowId ${workflowId} in the workflow execution started event`, - ); - } - - const [runFlowWorkflowParams] = inputs as RunAiFlowWorkflowParams[]; - - if (!runFlowWorkflowParams) { - throw new Error( - `No parameters of the "runFlow" workflow found for workflowId ${workflowId}`, - ); - } - - /** - * Avoid caching the entire `RunFlowWorkflowParams` object to reduce memory usage - * of the cache. - */ - const partialRunFlowWorkflowParams: PartialRunFlowWorkflowParams = { - userAuthentication: runFlowWorkflowParams.userAuthentication, - webId: runFlowWorkflowParams.webId, - }; - - await runFlowWorkflowParamsCache.set( - workflowId, - partialRunFlowWorkflowParams, - ); - - return partialRunFlowWorkflowParams; }; -type FlowContext = { +type FlowContextWithEntity = BaseFlowContext & { flowEntityId: EntityId; - stepId: string; - userAuthentication: { actorId: UserId }; - webId: WebId; }; /** @@ -123,28 +34,59 @@ type FlowContext = { * * This method must be called from a temporal activity that is * called within the `runFlow` temporal workflow. + * + * @param params.graphApiClient - The Graph API client to use for entity queries. + * Required to get `flowEntityId`. If not provided, `flowEntityId` will not be available. 
*/ -export const getFlowContext = async (): Promise => { +export async function getFlowContext(params: { + graphApiClient: GraphApi; +}): Promise; +export async function getFlowContext(params?: { + graphApiClient?: undefined; +}): Promise; +export async function getFlowContext(params?: { + graphApiClient?: GraphApi; +}): Promise { const activityContext = Context.current(); + const { workflowId, runId } = activityContext.info.workflowExecution; - const { workflowId } = activityContext.info.workflowExecution; + const temporalClient = await getTemporalClient(); + const cache = await getFlowContextCache(); - const { userAuthentication, webId } = await getPartialRunFlowWorkflowParams({ + // Get base workflow params from Temporal history + const { userAuthentication, webId } = await getBaseWorkflowParams({ workflowId, + temporalClient, + cache, }); - const flowEntityId = entityIdFromComponents( - webId, - // Assumes the flow entity UUID is the same as the workflow ID - workflowId as EntityUuid, - ); - const { activityId: stepId } = Context.current().info; - return { + const baseContext: BaseFlowContext = { userAuthentication, - flowEntityId, + runId, webId, stepId, + workflowId, }; -}; + + // If graphApiClient is provided, query for the flow entity + if (params?.graphApiClient) { + // Query for the flow entity by workflowId property + // This is necessary because the entity UUID may not match the workflow ID + // Uses shared utility with retry logic for race condition handling + const { flowEntityId } = await getFlowEntityInfo({ + workflowId, + userAuthentication, + graphApiClient: params.graphApiClient, + cache, + }); + + return { + ...baseContext, + flowEntityId, + }; + } + + return baseContext; +} diff --git a/apps/hash-integration-worker/src/shared/heartbeats.ts b/apps/hash-integration-worker/src/shared/heartbeats.ts new file mode 100644 index 00000000000..9475c41f753 --- /dev/null +++ b/apps/hash-integration-worker/src/shared/heartbeats.ts @@ -0,0 +1,4 @@ +/** + * 
If set when proxying an activity, the period of time without a heartbeat after which the activity is considered to have failed. + */ +export const heartbeatTimeoutSeconds = 10; diff --git a/apps/hash-integration-worker/src/workflows/run-flow-workflow.ts b/apps/hash-integration-worker/src/workflows/run-flow-workflow.ts index f7d7136af55..f1b6b5ccb57 100644 --- a/apps/hash-integration-worker/src/workflows/run-flow-workflow.ts +++ b/apps/hash-integration-worker/src/workflows/run-flow-workflow.ts @@ -11,6 +11,17 @@ import { } from "@temporalio/workflow"; import type { createFlowActivities } from "../activities/flow-activities.js"; +import { heartbeatTimeoutSeconds } from "../shared/heartbeats.js"; + +type FlowActivityId = keyof ReturnType; + +/** + * Activities which send a frequent heartbeat to ensure they are known to be still running, + * allowing for startToCloseTimeout to be longer in favour of a short heartbeatTimeout. + */ +const activitiesHeartbeating: FlowActivityId[] = [ + "persistIntegrationEntitiesAction", +]; const proxyFlowActivity: ProxyFlowActivity< IntegrationFlowActionDefinitionId, @@ -23,7 +34,21 @@ const proxyFlowActivity: ProxyFlowActivity< >({ cancellationType: ActivityCancellationType.ABANDON, - startToCloseTimeout: "300 second", + startToCloseTimeout: activitiesHeartbeating.includes(actionName) + ? "36000 second" // 10 hours + : "300 second", // 5 minutes + + /** + * The heartbeat timeout is the time elapsed without a heartbeat after which the activity is considered to have failed. 
+ * Note that: + * - heartbeat-ing activities can receive notification when a flow is cancelled/closed, by catching Context.current().cancelled + * - notification will only be received when the next heartbeat is processed, and so the activity should heartbeat frequently + * - heartbeats are throttled by default to 80% of the heartbeatTimeout, so sending a heartbeat does not mean it will be processed then + * - maxHeartbeatThrottleInterval can be set in WorkerOptions, and otherwise defaults to 60s + */ + heartbeatTimeout: activitiesHeartbeating.includes(actionName) + ? `${heartbeatTimeoutSeconds} second` + : undefined, retry: { maximumAttempts }, activityId, diff --git a/apps/plugin-browser/src/pages/popup/popup-contents/action-center/shared/use-flow-runs.ts b/apps/plugin-browser/src/pages/popup/popup-contents/action-center/shared/use-flow-runs.ts index 2d0efd92b8d..edfc75e6402 100644 --- a/apps/plugin-browser/src/pages/popup/popup-contents/action-center/shared/use-flow-runs.ts +++ b/apps/plugin-browser/src/pages/popup/popup-contents/action-center/shared/use-flow-runs.ts @@ -36,9 +36,16 @@ const mapFlowRunToMinimalFlowRun = ( outputName === ("persistedEntities" satisfies (typeof browserInferenceFlowOutput)["name"]) ) { - return ( - payload.value as PayloadKindValues[(typeof browserInferenceFlowOutput)["payloadKind"]] - ).persistedEntities; + /** + * The GraphQL layer resolves StoredPayloadRef values before returning them to clients, + * so payload.value is the actual resolved value at runtime, not a StoredPayloadRef. + * The type system includes StoredPayloadRef as a possible value type, but by the time + * the response reaches the browser, these have been resolved. 
+ * @see libs/@local/hash-backend-utils/src/flows/get-flow-run-details.ts + */ + const resolvedValue = + payload.value as PayloadKindValues[(typeof browserInferenceFlowOutput)["payloadKind"]]; + return resolvedValue.persistedEntities; } return []; }), diff --git a/libs/@blockprotocol/graph/src/codegen/preprocess.ts b/libs/@blockprotocol/graph/src/codegen/preprocess.ts index d9e3aaba38c..3c8a99ff9ed 100644 --- a/libs/@blockprotocol/graph/src/codegen/preprocess.ts +++ b/libs/@blockprotocol/graph/src/codegen/preprocess.ts @@ -1,11 +1,13 @@ import type { PreprocessContext } from "./context.js"; import { identifyLinkEntityTypes } from "./preprocess/identify-link-entity-types.js"; import { removeEmptyAllOfs } from "./preprocess/remove-empty-all-ofs.js"; +import { removeRedundantDataTypeInheritance } from "./preprocess/remove-redundant-data-type-inheritance.js"; import { rewriteTypeTitles } from "./preprocess/transform-type-titles.js"; export const preprocess = (context: PreprocessContext) => { rewriteTypeTitles(context); removeEmptyAllOfs(context); + removeRedundantDataTypeInheritance(context); identifyLinkEntityTypes(context); /* @todo - if properties are empty, remove the `allOf` */ }; diff --git a/libs/@blockprotocol/graph/src/codegen/preprocess/remove-redundant-data-type-inheritance.ts b/libs/@blockprotocol/graph/src/codegen/preprocess/remove-redundant-data-type-inheritance.ts new file mode 100644 index 00000000000..66f15d0b273 --- /dev/null +++ b/libs/@blockprotocol/graph/src/codegen/preprocess/remove-redundant-data-type-inheritance.ts @@ -0,0 +1,44 @@ +import { typedValues } from "../../util/typed-entries.js"; +import type { PreprocessContext } from "../context.js"; + +/** + * Checks if a data type schema has an enum constraint. + * + * Data types with enum constraints specify exact literal values, making + * parent type references redundant for TypeScript generation purposes. 
+ */ +const hasEnumConstraint = (dataType: object): boolean => { + return "enum" in dataType && Array.isArray(dataType.enum); +}; + +/** + * Removes the `allOf` field from data types that have enum constraints. + * + * When a data type inherits from a parent type (via `allOf`) but also specifies + * an `enum` constraint, the json-schema-to-typescript library generates an + * intersection type like `TextDataType & ("Value1" | "Value2")`. + * + * Since the enum values already fully describe the valid value space (and are + * implicitly compatible with the parent type), we can remove the `allOf` to + * produce cleaner TypeScript like `"Value1" | "Value2"`. + * + * This transformation only affects TypeScript generation – the original + * semantic inheritance relationship is preserved in the type system. + */ +export const removeRedundantDataTypeInheritance = ( + context: PreprocessContext, +) => { + context.logDebug( + "Removing redundant inheritance from data types with enum constraints", + ); + + for (const dataType of typedValues(context.dataTypes)) { + if (hasEnumConstraint(dataType) && "allOf" in dataType) { + context.logTrace( + `Removing allOf from ${dataType.$id} as it has enum constraints`, + ); + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access -- intentionally modifying the schema + delete (dataType as any).allOf; + } + } +}; diff --git a/libs/@local/graph/sdk/typescript/src/entity.ts b/libs/@local/graph/sdk/typescript/src/entity.ts index bee24643780..82e24830a52 100644 --- a/libs/@local/graph/sdk/typescript/src/entity.ts +++ b/libs/@local/graph/sdk/typescript/src/entity.ts @@ -497,15 +497,18 @@ export const propertyObjectToPatches = ( * * @deprecated this is a function for migration purposes only. * For new code, track which properties are actually changed where they are changed, and create the patch operations - * directly. 
IF you use this, bear in mind that newProperties MUST represent ALL the properties that the entity will + * directly. IF you use this, bear in mind that if removeProperties is true, + * newProperties MUST represent ALL the properties that the entity will * have after the patch. Any properties not specified in newProperties will be removed. */ export const patchesFromPropertyObjects = ({ oldProperties, newProperties, + removeProperties = true, }: { oldProperties: PropertyObject; newProperties: PropertyObjectWithMetadata; + removeProperties?: boolean; }): PropertyPatchOperation[] => { const patches: PropertyPatchOperation[] = []; @@ -528,12 +531,14 @@ export const patchesFromPropertyObjects = ({ } } - for (const key of typedKeys(oldProperties)) { - if (typeof newProperties.value[key] === "undefined") { - patches.push({ - op: "remove", - path: [key], - }); + if (removeProperties) { + for (const key of typedKeys(oldProperties)) { + if (typeof newProperties.value[key] === "undefined") { + patches.push({ + op: "remove", + path: [key], + }); + } } } diff --git a/libs/@local/hash-backend-utils/package.json b/libs/@local/hash-backend-utils/package.json index 26ca529137d..452d194cc34 100644 --- a/libs/@local/hash-backend-utils/package.json +++ b/libs/@local/hash-backend-utils/package.json @@ -49,6 +49,7 @@ "@temporalio/workflow": "1.12.1", "agentkeepalive": "4.6.0", "axios": "1.12.2", + "cache-manager": "5.7.6", "dotenv-flow": "3.3.0", "exponential-backoff": "3.1.3", "googleapis": "133.0.0", diff --git a/libs/@local/hash-backend-utils/src/file-storage.ts b/libs/@local/hash-backend-utils/src/file-storage.ts index 5a97e0b7c7c..e9571509d41 100644 --- a/libs/@local/hash-backend-utils/src/file-storage.ts +++ b/libs/@local/hash-backend-utils/src/file-storage.ts @@ -1,4 +1,4 @@ -import type { EntityId, VersionedUrl } from "@blockprotocol/type-system"; +import type { EntityId, Url, VersionedUrl } from "@blockprotocol/type-system"; import type { HashEntity } from 
"@local/hash-graph-sdk/entity"; import { apiOrigin } from "@local/hash-isomorphic-utils/environment"; import { systemEntityTypes } from "@local/hash-isomorphic-utils/ontology-type-ids"; @@ -14,7 +14,7 @@ export const isStorageType = ( /** Helper type to create a typed "dictionary" of storage types to their storage provider instance */ export type StorageProviderLookup = Partial< - Record + Record >; /** @@ -23,20 +23,6 @@ export type StorageProviderLookup = Partial< */ export const storageProviderLookup: StorageProviderLookup = {}; -/** Interface describing a generic storage provider - * used for allowing the download and upload files via presigned request. - * The storage provider doesn't upload the file itself, instead it returns a URL - * and form-data fields for the client to upload their file to. - */ -export interface FileStorageProvider { - storageType: StorageType; - /** - * Presigns a file download request for a client to later download a file - * @return {string} The download URL to access the file - */ - presignDownload(params: PresignedDownloadRequest): Promise; -} - export interface GetFileEntityStorageKeyParams { entityId: EntityId; editionIdentifier: string; @@ -47,7 +33,27 @@ export type FileStorageKey = `${ | `${string}/` // optional path prefix | ""}${EntityId}/${string}/${string}`; -export interface UploadableStorageProvider extends FileStorageProvider { +export interface GetFlowOutputStorageKeyParams { + workflowId: string; + runId: string; + stepId: string; + outputName: string; +} + +/** + * Interface describing a storage provider for file upload/download operations. + * Supports both presigned URLs for client-side operations and direct access + * for services with storage credentials. 
+ */ +export interface FileStorageProvider { + storageType: StorageType; + + /** + * Presigns a file download request for a client to later download a file + * @return {string} The download URL to access the file + */ + presignDownload(params: PresignedDownloadRequest): Promise; + /** * Presigns a file upload request for a client to later upload a file * @return Promise contains the presignedPut object with the url to PUT the file to, and the file storage @@ -81,6 +87,34 @@ export interface UploadableStorageProvider extends FileStorageProvider { this: void, params: GetFileEntityStorageKeyParams, ): FileStorageKey; + + /** + * Generate a storage key for flow output payloads. + * Format: flows/{workflowId}/{runId}/{stepId}/{outputName}.json + */ + getFlowOutputStorageKey( + this: void, + params: GetFlowOutputStorageKeyParams, + ): string; + + /** + * Upload data directly to storage without presigning. + * Used by workers that have direct storage credentials. + */ + uploadDirect( + this: void, + params: { + key: string; + body: string | Buffer; + contentType?: string; + }, + ): Promise; + + /** + * Download data directly from storage without presigning. + * Used by workers that have direct storage credentials. 
+ */ + downloadDirect(this: void, params: { key: string }): Promise; } /** Parameters needed to allow the storage of a file */ diff --git a/libs/@local/hash-backend-utils/src/file-storage/aws-s3-storage-provider.ts b/libs/@local/hash-backend-utils/src/file-storage/aws-s3-storage-provider.ts index d266e4677ba..64aa88d0456 100644 --- a/libs/@local/hash-backend-utils/src/file-storage/aws-s3-storage-provider.ts +++ b/libs/@local/hash-backend-utils/src/file-storage/aws-s3-storage-provider.ts @@ -10,11 +10,12 @@ import { simplifyProperties } from "@local/hash-isomorphic-utils/simplify-proper import type { File } from "@local/hash-isomorphic-utils/system-types/shared"; import type { + FileStorageProvider, GetFileEntityStorageKeyParams, + GetFlowOutputStorageKeyParams, PresignedDownloadRequest, PresignedStorageRequest, StorageType, - UploadableStorageProvider, } from "../file-storage.js"; export interface AwsS3StorageProviderConstructorArgs { @@ -26,7 +27,7 @@ export interface AwsS3StorageProviderConstructorArgs { } /** Implementation of the storage provider for AWS S3. Uploads all files to a single bucket */ -export class AwsS3StorageProvider implements UploadableStorageProvider { +export class AwsS3StorageProvider implements FileStorageProvider { /** The S3 client is created in the constructor and kept as long as the instance lives */ private client: S3Client; private bucket: string; @@ -192,4 +193,65 @@ export class AwsS3StorageProvider implements UploadableStorageProvider { }: GetFileEntityStorageKeyParams) { return `files/${entityId}/${editionIdentifier}/${filename}` as const; } + + /** + * Generate a storage key for flow output payloads. + * Format: flows/{workflowId}/{runId}/{stepId}/{outputName}.json + */ + getFlowOutputStorageKey({ + workflowId, + runId, + stepId, + outputName, + }: GetFlowOutputStorageKeyParams) { + return `flows/${workflowId}/${runId}/${stepId}/${outputName}.json` as const; + } + + /** + * Upload data directly to S3 without presigning. 
+ * Used by workers that have direct S3 credentials. + */ + async uploadDirect({ + key, + body, + contentType = "application/json", + }: { + key: string; + body: string | Buffer; + contentType?: string; + }): Promise { + const command = new PutObjectCommand({ + Bucket: this.bucket, + Key: key, + Body: body, + ContentType: contentType, + }); + + await this.client.send(command); + } + + /** + * Download data directly from S3 without presigning. + * Used by workers that have direct S3 credentials. + */ + async downloadDirect({ key }: { key: string }): Promise { + const command = new GetObjectCommand({ + Bucket: this.bucket, + Key: key, + }); + + const response = await this.client.send(command); + + if (!response.Body) { + throw new Error(`No body returned for S3 key: ${key}`); + } + + // Convert the readable stream to a Buffer + const chunks: Uint8Array[] = []; + for await (const chunk of response.Body as AsyncIterable) { + chunks.push(chunk); + } + + return Buffer.concat(chunks); + } } diff --git a/libs/@local/hash-backend-utils/src/flows.ts b/libs/@local/hash-backend-utils/src/flows.ts index 4445272021d..b355d36546a 100644 --- a/libs/@local/hash-backend-utils/src/flows.ts +++ b/libs/@local/hash-backend-utils/src/flows.ts @@ -26,6 +26,7 @@ import { } from "@local/hash-isomorphic-utils/ontology-type-ids"; import type { FlowRun as FlowRunEntity } from "@local/hash-isomorphic-utils/system-types/shared"; +import type { FileStorageProvider } from "./file-storage.js"; import { getFlowRunFromTemporalWorkflowId, getSparseFlowRunFromTemporalWorkflowId, @@ -49,6 +50,7 @@ type GetFlowRunByIdFnArgs = { flowRunId: EntityUuid; includeDetails: IncludeDetails; graphApiClient: GraphApi; + storageProvider: FileStorageProvider; temporalClient: TemporalClient; userAuthentication: { actorId: ActorEntityUuid }; }; @@ -69,6 +71,7 @@ export async function getFlowRunById({ flowRunId, includeDetails, graphApiClient, + storageProvider, temporalClient, userAuthentication, }: 
GetFlowRunByIdFnArgs): Promise { @@ -104,6 +107,7 @@ export async function getFlowRunById({ return getFlowRunFromTemporalWorkflowId({ flowRunId: entityUuid, name, + storageProvider, temporalClient, temporalWorkflowId, webId, @@ -137,6 +141,7 @@ type GetFlowRunsFnArgs = { filters: GetFlowRunsFilters; includeDetails: IncludeDetails; graphApiClient: GraphApi; + storageProvider: FileStorageProvider; temporalClient: TemporalClient; }; @@ -164,6 +169,7 @@ export async function getFlowRuns({ filters, graphApiClient, includeDetails, + storageProvider, temporalClient, }: GetFlowRunsFnArgs): Promise { const temporalWorkflowIdToFlowDetails = await queryEntities( @@ -245,13 +251,6 @@ export async function getFlowRuns({ )}"`; } - /** - * Order by StartTime DESC so that the latest run for each workflowId comes first. - * This allows the `workflowIdToLatestRunTime` logic below to correctly skip older runs - * (e.g. from workflow resets) by only recording the first (latest) start time we see. - */ - query += ` ORDER BY StartTime DESC`; - const workflowIterable = temporalClient.workflow.list({ query }); const workflowIdToLatestRunTime: Record = {}; @@ -287,6 +286,7 @@ export async function getFlowRuns({ const runInfo = await getFlowRunFromTemporalWorkflowId({ flowRunId: flowDetails.flowRunId, name: flowDetails.name, + storageProvider, temporalClient, temporalWorkflowId: flowDetails.temporalWorkflowId, webId: flowDetails.webId, diff --git a/libs/@local/hash-backend-utils/src/flows/get-flow-context.ts b/libs/@local/hash-backend-utils/src/flows/get-flow-context.ts new file mode 100644 index 00000000000..6a1461f7ed1 --- /dev/null +++ b/libs/@local/hash-backend-utils/src/flows/get-flow-context.ts @@ -0,0 +1,203 @@ +import type { EntityId, UserId, WebId } from "@blockprotocol/type-system"; +import type { GraphApi } from "@local/hash-graph-client"; +import { queryEntities } from "@local/hash-graph-sdk/entity"; +import type { RunAiFlowWorkflowParams } from 
"@local/hash-isomorphic-utils/flows/temporal-types"; +import { + currentTimeInstantTemporalAxes, + generateVersionedUrlMatchingFilter, +} from "@local/hash-isomorphic-utils/graph-queries"; +import { + systemEntityTypes, + systemPropertyTypes, +} from "@local/hash-isomorphic-utils/ontology-type-ids"; +import type { FlowRun as FlowRunEntity } from "@local/hash-isomorphic-utils/system-types/shared"; +import type { Client as TemporalClient } from "@temporalio/client"; +import type { MemoryCache } from "cache-manager"; +import { caching } from "cache-manager"; +import { backOff } from "exponential-backoff"; + +import { parseHistoryItemPayload } from "../temporal/parse-history-item-payload.js"; + +let _flowContextCache: MemoryCache | undefined; + +/** + * Get the shared memory cache for flow context data. + * Creates the cache lazily on first access. + */ +export const getFlowContextCache = async (): Promise => { + _flowContextCache = + _flowContextCache ?? + (await caching("memory", { + max: 100, // 100 items + ttl: 10 * 60 * 1000, // 10 minutes + })); + return _flowContextCache; +}; + +export type FlowEntityInfo = { + flowEntityId: EntityId; +}; + +/** + * Query for the flow entity by workflowId property with retry logic. + * The workflowId is stored as a property on the FlowRun entity. + * Results are cached to avoid repeated queries. + * + * Includes retry logic to handle the race condition where an activity + * starts executing before the FlowRun entity has been persisted. 
+ * + * @param params.workflowId - The Temporal workflow ID + * @param params.userAuthentication - Authentication context for the query + * @param params.graphApiClient - The Graph API client to use + * @param params.cache - Optional cache instance (defaults to shared cache) + */ +export const getFlowEntityInfo = async (params: { + workflowId: string; + userAuthentication: { actorId: UserId }; + graphApiClient: GraphApi; + cache?: MemoryCache; +}): Promise => { + const { workflowId, userAuthentication, graphApiClient } = params; + + const cache = params.cache ?? (await getFlowContextCache()); + const cacheKey = `flowEntity-${workflowId}`; + + const cachedInfo = await cache.get(cacheKey); + if (cachedInfo) { + return cachedInfo; + } + + // Query for the flow entity using the workflowId property + // Use backOff to handle the race condition where the entity might not be persisted yet + const flowEntity = await backOff( + async () => { + const { + entities: [entity], + } = await queryEntities( + { graphApi: graphApiClient }, + userAuthentication, + { + filter: { + all: [ + { + equal: [ + { + path: [ + "properties", + systemPropertyTypes.workflowId.propertyTypeBaseUrl, + ], + }, + { parameter: workflowId }, + ], + }, + generateVersionedUrlMatchingFilter( + systemEntityTypes.flowRun.entityTypeId, + { ignoreParents: true }, + ), + ], + }, + temporalAxes: currentTimeInstantTemporalAxes, + includeDrafts: false, + includePermissions: false, + }, + ); + + if (!entity) { + throw new Error( + `Flow entity not found for workflowId ${workflowId}. The flow entity may not have been persisted yet.`, + ); + } + + return entity; + }, + { + numOfAttempts: 5, + startingDelay: 500, + maxDelay: 5000, + jitter: "full", + }, + ); + + const flowEntityInfo: FlowEntityInfo = { + flowEntityId: flowEntity.metadata.recordId.entityId, + }; + + await cache.set(cacheKey, flowEntityInfo); + return flowEntityInfo; +}; + +/** + * Base workflow params that are common to both AI and integration flows. 
+ */ +export type BaseWorkflowParams = { + webId: WebId; + userAuthentication: { actorId: UserId }; +}; + +/** + * Parse base workflow params from Temporal workflow history. + * Results are cached to avoid repeated Temporal API calls. + * + * @param params.workflowId - The Temporal workflow ID + * @param params.temporalClient - The Temporal client to use + * @param params.cache - Optional cache instance (defaults to shared cache) + */ +export const getBaseWorkflowParams = async (params: { + workflowId: string; + temporalClient: TemporalClient; + cache?: MemoryCache; +}): Promise => { + const { workflowId, temporalClient } = params; + + const cache = params.cache ?? (await getFlowContextCache()); + const cacheKey = `workflowParams-${workflowId}`; + + const cachedParams = await cache.get(cacheKey); + if (cachedParams) { + return cachedParams; + } + + const handle = temporalClient.workflow.getHandle(workflowId); + + const { events } = await handle.fetchHistory(); + + if (!events) { + throw new Error(`No events found for workflowId ${workflowId}`); + } + + const workflowExecutionStartedEventAttributes = + events.find((event) => event.workflowExecutionStartedEventAttributes) + ?.workflowExecutionStartedEventAttributes ?? 
undefined; + + if (!workflowExecutionStartedEventAttributes) { + throw new Error( + `No workflow execution started event attributes found for workflowId ${workflowId}`, + ); + } + + const inputs = parseHistoryItemPayload( + workflowExecutionStartedEventAttributes.input, + ); + + if (!inputs) { + throw new Error( + `No inputs found for workflowId ${workflowId} in the workflow execution started event`, + ); + } + + const [runFlowWorkflowParams] = inputs as RunAiFlowWorkflowParams[]; + + if (!runFlowWorkflowParams) { + throw new Error( + `No parameters of the "runFlow" workflow found for workflowId ${workflowId}`, + ); + } + + const baseParams: BaseWorkflowParams = { + userAuthentication: runFlowWorkflowParams.userAuthentication, + webId: runFlowWorkflowParams.webId, + }; + + await cache.set(cacheKey, baseParams); + return baseParams; +}; diff --git a/libs/@local/hash-backend-utils/src/flows/get-flow-run-details.ts b/libs/@local/hash-backend-utils/src/flows/get-flow-run-details.ts index 29262b4cb36..e981e60149d 100644 --- a/libs/@local/hash-backend-utils/src/flows/get-flow-run-details.ts +++ b/libs/@local/hash-backend-utils/src/flows/get-flow-run-details.ts @@ -8,8 +8,14 @@ import type { FlowInputs, FlowSignalType, ProgressLogSignal, + ResolvedPayload, + ResolvedStepOutput, + ResolvedStepRunOutput, SparseFlowRun, + StepOutput, + StepRunOutput, } from "@local/hash-isomorphic-utils/flows/types"; +import { isStoredPayloadRef } from "@local/hash-isomorphic-utils/flows/types"; import type { FlowRun, FlowRunStatus, @@ -24,11 +30,64 @@ import { } from "@temporalio/common"; import proto from "@temporalio/proto"; +import type { FileStorageProvider } from "../file-storage.js"; import { temporalNamespace } from "../temporal.js"; import { parseHistoryItemPayload } from "../temporal/parse-history-item-payload.js"; +import { retrievePayload } from "./payload-storage.js"; type IHistoryEvent = proto.temporal.api.history.v1.IHistoryEvent; +/** + * Cache for resolved payloads to avoid 
re-downloading the same S3 objects. + * Keyed by S3 storage key. + */ +type PayloadCache = Map; + +/** + * Resolve any stored payload references in step outputs. + * This downloads the actual payload data from S3 and replaces the reference. + * + * @param outputs - The step outputs to resolve + * @param storageProvider - The storage provider to retrieve payloads from + * @param cache - Optional cache to avoid re-downloading the same S3 objects + */ +const resolveStoredPayloadsInOutputs = async ( + outputs: StepOutput[] | undefined, + storageProvider: FileStorageProvider, + cache?: PayloadCache, +): Promise => { + if (!outputs) { + return outputs; + } + + return Promise.all( + outputs.map(async (output) => { + const { payload } = output; + + if (isStoredPayloadRef(payload.value)) { + const storageKey = payload.value.storageKey; + + // Check cache first + let resolvedValue = cache?.get(storageKey); + if (resolvedValue === undefined) { + resolvedValue = await retrievePayload(storageProvider, payload.value); + cache?.set(storageKey, resolvedValue); + } + + return { + ...output, + payload: { + kind: payload.kind, + value: resolvedValue, + } as unknown as ResolvedPayload, + } satisfies ResolvedStepOutput; + } + + return output as ResolvedStepOutput; + }), + ); +}; + const eventTimeIsoStringFromEvent = (event?: IHistoryEvent) => { const { eventTime } = event ?? 
{}; if (!eventTime?.seconds) { @@ -130,9 +189,12 @@ const getActivityStartedDetails = ( const getFlowRunDetailedFields = async ({ workflowId, temporalClient, + storageProvider, }: { workflowId: string; temporalClient: TemporalClient; + /** Storage provider for resolving stored payload references */ + storageProvider: FileStorageProvider; }): Promise> => { const handle = temporalClient.workflow.getHandle(workflowId); @@ -224,7 +286,11 @@ const getFlowRunDetailedFields = async ({ .EVENT_TYPE_WORKFLOW_EXECUTION_FAILED, )?.workflowExecutionFailedEventAttributes?.failure?.message; - const stepMap: { [activityId: string]: StepRun } = {}; + const unresolvedStepMap: { + [activityId: string]: Omit & { + outputs?: StepRunOutput[] | null; + }; + } = {}; /** * Collect all progress signal events when building the step map, @@ -396,12 +462,14 @@ const getFlowRunDetailedFields = async ({ continue; } - if (stepMap[activityId]) { + if (unresolvedStepMap[activityId]) { // We've already encountered and therefore populated all the details for this step continue; } - const activityRecord: StepRun = { + const activityRecord: Omit & { + outputs?: StepRunOutput[] | null; + } = { stepId: activityId, stepType: activityType ?? 
"UNKNOWN", startedAt, @@ -415,7 +483,7 @@ const getFlowRunDetailedFields = async ({ attempt, }; - stepMap[activityId] = activityRecord; + unresolvedStepMap[activityId] = activityRecord; switch (event.eventType) { case proto.temporal.api.enums.v1.EventType @@ -523,7 +591,7 @@ const getFlowRunDetailedFields = async ({ } for (const checkpoint of checkpointLogs) { - const step = stepMap[checkpoint.stepId]; + const step = unresolvedStepMap[checkpoint.stepId]; if (!step) { throw new Error( `Could not find step with id ${checkpoint.stepId} for checkpoint with id ${checkpoint.checkpointId}`, @@ -563,7 +631,7 @@ const getFlowRunDetailedFields = async ({ for (const log of logs) { const { stepId } = log; - const activityRecord = stepMap[stepId]; + const activityRecord = unresolvedStepMap[stepId]; if (!activityRecord) { throw new Error(`No activity record found for step with id ${stepId}`); } @@ -575,7 +643,7 @@ const getFlowRunDetailedFields = async ({ const inputRequests = Object.values(inputRequestsById); for (const inputRequest of inputRequests) { if (!workflowStoppedEarly && !inputRequest.resolvedAt) { - const step = stepMap[inputRequest.stepId]; + const step = unresolvedStepMap[inputRequest.stepId]; if (!step) { throw new Error( `Could not find step with id ${inputRequest.stepId} for input request with id ${inputRequest.requestId}`, @@ -589,16 +657,93 @@ const getFlowRunDetailedFields = async ({ throw new Error("No workflow inputs found"); } - for (const step of Object.values(stepMap)) { + for (const step of Object.values(unresolvedStepMap)) { step.logs.sort((a, b) => a.recordedAt.localeCompare(b.recordedAt)); } + // Create a cache for resolved payloads to avoid re-downloading the same S3 objects + // This is shared between step outputs and workflow outputs + const payloadCache: PayloadCache = new Map(); + + // Resolve any stored payload references in step outputs + const steps: StepRun[] = await Promise.all( + Object.values(unresolvedStepMap).map(async (step) => { + // 
The outputs in stepMap are from Temporal history - Status<{outputs: StepOutput[]}> objects + // We need to resolve stored refs in the actual step outputs within those Status objects + if (!step.outputs) { + return step as StepRun; + } + + const resolvedOutputs = await Promise.all( + step.outputs.map(async (output) => { + // output is Status<{outputs: StepOutput[]}> + const firstContent = output.contents[0]; + if (!firstContent?.outputs) { + return output as ResolvedStepRunOutput; + } + + const resolvedInnerOutputs = await resolveStoredPayloadsInOutputs( + firstContent.outputs, + storageProvider, + payloadCache, + ); + + return { + ...output, + contents: [ + { + ...firstContent, + outputs: resolvedInnerOutputs ?? [], + }, + ], + }; + }), + ); + + return { + ...step, + outputs: resolvedOutputs, + }; + }), + ); + + // Resolve stored payload references in workflow outputs for consistency with step outputs. + // Workflow outputs may reference the same S3 locations as step outputs, so we use the same + // cache to avoid redundant downloads. + let resolvedWorkflowOutputs: ResolvedStepRunOutput[] | undefined; + if (workflowOutputs && Array.isArray(workflowOutputs)) { + resolvedWorkflowOutputs = await Promise.all( + (workflowOutputs as StepRunOutput[]).map(async (output) => { + const firstContent = output.contents[0]; + if (!firstContent?.outputs) { + return output as ResolvedStepRunOutput; + } + + const resolvedInnerOutputs = await resolveStoredPayloadsInOutputs( + firstContent.outputs, + storageProvider, + payloadCache, + ); + + return { + ...output, + contents: [ + { + ...firstContent, + outputs: resolvedInnerOutputs ?? [], + }, + ], + }; + }), + ); + } + return { failureMessage: workflowFailureMessage, inputs: workflowInputs, - outputs: workflowOutputs, + outputs: resolvedWorkflowOutputs ?? 
workflowOutputs, inputRequests: Object.values(inputRequestsById), - steps: Object.values(stepMap), + steps, startedAt: workflowStartedAt.toISOString(), }; }; @@ -660,11 +805,14 @@ export const getFlowRunFromTemporalWorkflowId = async (args: { /** the identifier for the Temporal workflow */ temporalWorkflowId: string; webId: WebId; + /** Storage provider for resolving stored payload references */ + storageProvider: FileStorageProvider; }): Promise => { const baseFields = await getSparseFlowRunFromTemporalWorkflowId(args); const detailedFields = await getFlowRunDetailedFields({ workflowId: args.temporalWorkflowId, temporalClient: args.temporalClient, + storageProvider: args.storageProvider, }); return { diff --git a/libs/@local/hash-backend-utils/src/flows/payload-storage.ts b/libs/@local/hash-backend-utils/src/flows/payload-storage.ts new file mode 100644 index 00000000000..f5e3955b6e8 --- /dev/null +++ b/libs/@local/hash-backend-utils/src/flows/payload-storage.ts @@ -0,0 +1,130 @@ +import type { + PayloadKindValues, + StoredPayloadKind, + StoredPayloadRef, +} from "@local/hash-isomorphic-utils/flows/types"; + +import { getAwsS3Config } from "../aws-config.js"; +import type { FileStorageProvider } from "../file-storage.js"; +import { AwsS3StorageProvider } from "../file-storage/aws-s3-storage-provider.js"; + +let _storageProvider: FileStorageProvider | undefined; + +/** + * Get a singleton instance of the S3 storage provider. + * This is shared across all activities in a worker. + */ +export const getStorageProvider = (): FileStorageProvider => { + if (!_storageProvider) { + const s3Config = getAwsS3Config(); + _storageProvider = new AwsS3StorageProvider(s3Config); + } + return _storageProvider; +}; + +export type StorePayloadParams< + K extends StoredPayloadKind, + IsArray extends boolean, +> = { + storageProvider: FileStorageProvider; + workflowId: string; + runId: string; + stepId: string; + outputName: string; + kind: K; + value: IsArray extends true ? 
PayloadKindValues[K][] : PayloadKindValues[K]; +}; + +/** + * Store a payload in S3 and return a typed reference to it. + * The return type is inferred based on whether `value` is an array. + * + * Used to avoid passing large payloads through Temporal activities. + * Only works with StoredPayloadKind types (ProposedEntity, ProposedEntityWithResolvedLinks, PersistedEntitiesMetadata). + */ +export const storePayload = async < + K extends StoredPayloadKind, + V extends PayloadKindValues[K] | PayloadKindValues[K][], +>(params: { + storageProvider: FileStorageProvider; + workflowId: string; + runId: string; + stepId: string; + outputName: string; + kind: K; + value: V; +}): Promise> => { + const { + storageProvider, + workflowId, + runId, + stepId, + outputName, + kind, + value, + } = params; + + const storageKey = storageProvider.getFlowOutputStorageKey({ + workflowId, + runId, + stepId, + outputName, + }); + + const isArray = Array.isArray(value); + const body = JSON.stringify(value); + + await storageProvider.uploadDirect({ + key: storageKey, + body, + contentType: "application/json", + }); + + return { + __stored: true, + kind, + storageKey, + array: isArray, + } as StoredPayloadRef; +}; + +/** + * Retrieve a payload from S3 using a stored reference. + */ +export const retrievePayload = async < + K extends StoredPayloadKind, + IsArray extends boolean, +>( + storageProvider: FileStorageProvider, + ref: StoredPayloadRef, +): Promise< + IsArray extends true ? PayloadKindValues[K][] : PayloadKindValues[K] +> => { + const buffer = await storageProvider.downloadDirect({ key: ref.storageKey }); + const data = JSON.parse(buffer.toString("utf-8")) as IsArray extends true + ? PayloadKindValues[K][] + : PayloadKindValues[K]; + + return data; +}; + +/** + * Resolve a stored payload reference to its actual value. + * The return type is inferred from the ref's `IsArray` type parameter. 
+ * + * Only works with StoredPayloadKind types (ProposedEntity, ProposedEntityWithResolvedLinks, PersistedEntitiesMetadata). + * + * @param _kind - The payload kind, used for type inference at call sites + */ +export const resolvePayloadValue = async < + K extends StoredPayloadKind, + IsArray extends boolean, +>( + storageProvider: FileStorageProvider, + _kind: K, + ref: StoredPayloadRef, +): Promise< + IsArray extends true ? PayloadKindValues[K][] : PayloadKindValues[K] +> => { + return retrievePayload(storageProvider, ref); +}; diff --git a/libs/@local/hash-backend-utils/src/flows/process-flow-workflow.ts b/libs/@local/hash-backend-utils/src/flows/process-flow-workflow.ts index 333cfc3700c..ceb2e02bedf 100644 --- a/libs/@local/hash-backend-utils/src/flows/process-flow-workflow.ts +++ b/libs/@local/hash-backend-utils/src/flows/process-flow-workflow.ts @@ -11,6 +11,7 @@ import type { Payload, StepOutput, } from "@local/hash-isomorphic-utils/flows/types"; +import { isStoredPayloadRef } from "@local/hash-isomorphic-utils/flows/types"; import { validateFlowDefinition } from "@local/hash-isomorphic-utils/flows/util"; import { stringifyError } from "@local/hash-isomorphic-utils/stringify-error"; import type { Status } from "@local/status"; @@ -391,6 +392,21 @@ export const processFlowWorkflow = async < const arrayToParallelizeOn = inputToParallelizeOn.payload.value; + /** + * @todo H-6169: could enable this by creating an activity to retrieve the stored payloads and pass out the values, + * but we'd need to be careful that this didn't re-introduce the problem the offloaded storage is trying to solve (big outputs and inputs to Temporal activities). + * A better solution would be for activities to somehow pick items from the stored array after retrieving it, but at that point we're rethinking how Flows + * are orchestrated and we want to do so wholesale. Deferred until there's a need. 
+ */ + if (isStoredPayloadRef(arrayToParallelizeOn)) { + processStepErrors[currentStepId] = { + code: StatusCode.Internal, + message: `Cannot parallelize on a stored payload reference for step ${currentStepId}. Stored payloads can only be resolved by activities.`, + }; + + return; + } + const newSteps = arrayToParallelizeOn.flatMap( (parallelizedValue, index) => parallelGroupStepDefinitions.map((stepDefinition) => { diff --git a/libs/@local/hash-backend-utils/src/flows/process-flow-workflow/pass-outputs-to-unprocessed-steps.ts b/libs/@local/hash-backend-utils/src/flows/process-flow-workflow/pass-outputs-to-unprocessed-steps.ts index ccfcc48d250..5bf1e41d284 100644 --- a/libs/@local/hash-backend-utils/src/flows/process-flow-workflow/pass-outputs-to-unprocessed-steps.ts +++ b/libs/@local/hash-backend-utils/src/flows/process-flow-workflow/pass-outputs-to-unprocessed-steps.ts @@ -10,6 +10,7 @@ import type { StepInputSource, StepOutput, } from "@local/hash-isomorphic-utils/flows/types"; +import { isStoredPayloadRef } from "@local/hash-isomorphic-utils/flows/types"; import type { Status } from "@local/status"; import { StatusCode } from "@local/status"; @@ -202,12 +203,21 @@ export const passOutputsToUnprocessedSteps = (params: { ({ outputName }) => outputName === aggregateOutput.stepOutputName, )!; + const existingValue = processedStep.aggregateOutput?.payload.value; + const newValue = matchingOutput.payload.value; + + /** + * @todo H-6169: enable the aggregated output of groups of parallel steps to contain stored payload references. + */ + if (isStoredPayloadRef(existingValue) || isStoredPayloadRef(newValue)) { + throw new Error( + `Cannot aggregate stored payload references. The stored payloads must be resolved before aggregation.`, + ); + } + const aggregateOutputPayload: ArrayPayload = { kind: aggregateOutput.payloadKind, - value: [ - ...(processedStep.aggregateOutput?.payload.value ?? []), - matchingOutput.payload.value, - ].flat(), + value: [...(existingValue ?? 
[]), newValue].flat(), } as ArrayPayload; processedStep.aggregateOutput = { @@ -215,10 +225,29 @@ export const passOutputsToUnprocessedSteps = (params: { payload: aggregateOutputPayload, }; + const inputToParallelizeValue = + processedStep.inputToParallelizeOn?.payload.value; + const aggregateOutputValue = + processedStep.aggregateOutput.payload.value; + + // Stored refs should have been caught earlier in process-flow-workflow + /** + * @todo H-6169: enable the aggregated output of groups of parallel steps to contain stored payload references. + */ + if ( + isStoredPayloadRef(inputToParallelizeValue) || + isStoredPayloadRef(aggregateOutputValue) + ) { + throw new Error( + `Unexpected stored payload ref in parallelization context`, + ); + } + if ( processedStep.inputToParallelizeOn && - processedStep.inputToParallelizeOn.payload.value.length === - processedStep.aggregateOutput.payload.value.length + inputToParallelizeValue && + (inputToParallelizeValue as unknown[]).length === + (aggregateOutputValue as unknown[]).length ) { /** * If the number of items in the input that were parallelized on is diff --git a/libs/@local/hash-backend-utils/src/integrations/aviation.ts b/libs/@local/hash-backend-utils/src/integrations/aviation.ts index 4d222643120..f30ffa0559d 100644 --- a/libs/@local/hash-backend-utils/src/integrations/aviation.ts +++ b/libs/@local/hash-backend-utils/src/integrations/aviation.ts @@ -1,5 +1,6 @@ export { type AviationProposedEntity, + getHistoricalArrivalEntities, getScheduledArrivalEntities, } from "./aviation/aero-api/client.js"; export { getFlightPositionProperties } from "./aviation/flightradar24/client.js"; diff --git a/libs/@local/hash-backend-utils/src/integrations/aviation/aero-api/client.ts b/libs/@local/hash-backend-utils/src/integrations/aviation/aero-api/client.ts index 5dbe4c01f33..1c5921220da 100644 --- a/libs/@local/hash-backend-utils/src/integrations/aviation/aero-api/client.ts +++ 
b/libs/@local/hash-backend-utils/src/integrations/aviation/aero-api/client.ts @@ -1,19 +1,23 @@ import type { ProvidedEntityEditionProvenance } from "@blockprotocol/type-system"; +import { createRateLimitedRequester } from "../../../rate-limiter.js"; import { type BatchFlightGraphResult, buildFlightGraphBatch, } from "./client/build-graph.js"; import { generateAeroApiProvenance } from "./client/provenance.js"; import type { + AeroApiHistoricalArrivalsResponse, AeroApiScheduledArrivalsResponse, AeroApiScheduledFlight, + HistoricalArrivalsRequestParams, ScheduledArrivalsRequestParams, } from "./client/types.js"; export type { AviationProposedEntity } from "./client/build-graph.js"; export type { AeroApiAirport, + AeroApiHistoricalArrivalsResponse, AeroApiPaginationLinks, AeroApiScheduledArrivalsResponse, AeroApiScheduledFlight, @@ -21,6 +25,23 @@ export type { const baseUrl = "https://aeroapi.flightaware.com/aeroapi"; +/** + * Maximum pages to request from the API. + * The rate limit is 5 result sets/second, and each page is one result set. + */ +const DEFAULT_MAX_PAGES = 5; + +/** + * Minimum interval between requests in milliseconds. + * Throttles requests to comply with the 5 result sets/second rate limit. + */ +const REQUEST_INTERVAL_MS = 1000; + +/** + * Maximum number of retry attempts for rate limit errors. + */ +const MAX_RETRIES = 10; + const generateUrl = ( path: string, params?: Record, @@ -38,7 +59,11 @@ const generateUrl = ( return url.toString(); }; -const makeRequest = async (url: string): Promise => { +/** + * Raw fetch function for AeroAPI requests. + * Throws an error with status property for rate limit handling. 
+ */ +async function fetchAeroApi(url: string): Promise { const apiKey = process.env.AERO_API_KEY; if (!apiKey) { @@ -53,14 +78,24 @@ const makeRequest = async (url: string): Promise => { }); if (!response.ok) { - const errorText = await response.text(); - throw new Error( - `AeroAPI error (${response.status}): ${errorText || response.statusText}`, + const error = new Error( + `AeroAPI error (${response.status}): ${(await response.text()) || response.statusText}`, ); + (error as Error & { status: number }).status = response.status; + throw error; } return (await response.json()) as T; -}; +} + +/** + * Rate-limited request function for AeroAPI. + * Uses promise chaining to ensure proper request spacing and handles 429 errors with backoff. + */ +const makeRequest = createRateLimitedRequester(fetchAeroApi, { + requestIntervalMs: REQUEST_INTERVAL_MS, + maxRetries: MAX_RETRIES, +}); /** * Retrieve a single page of scheduled arrivals for an airport. @@ -70,10 +105,10 @@ const makeRequest = async (url: string): Promise => { const getScheduledArrivals = async ( params: ScheduledArrivalsRequestParams, ): Promise => { - const { airportIcao, ...queryParams } = params; + const { airportIcao, max_pages = DEFAULT_MAX_PAGES, ...queryParams } = params; const url = generateUrl( `/airports/${airportIcao}/flights/scheduled_arrivals`, - queryParams, + { ...queryParams, max_pages }, ); return makeRequest(url); }; @@ -134,3 +169,132 @@ export const getScheduledArrivalEntities = async ( provenance, }; }; + +/** + * Retrieve a single page of historical arrivals for an airport. 
/**
 * Generate 24-hour time chunks covering an inclusive range of calendar dates.
 *
 * One chunk is produced per day from `startDate` to `endDate` (inclusive).
 * Each chunk runs from 04:00:00 UTC on that day to 03:59:59 UTC on the
 * following day, which is intended to align with operational days.
 *
 * @param startDate - First day of the range, in YYYY-MM-DD format
 * @param endDate - Last day of the range (inclusive), in YYYY-MM-DD format
 * @returns One `{ start, end }` pair of ISO 8601 timestamps per day, in
 *   chronological order. Empty when `startDate` is after `endDate`.
 * @throws Error if either argument is not a real calendar date in YYYY-MM-DD
 *   format. Without this check an unparseable date makes the loop condition
 *   false and the function would silently return no chunks at all.
 */
const generateDateChunks = (
  startDate: string,
  endDate: string,
): Array<{ start: string; end: string }> => {
  /** Parse a YYYY-MM-DD string as midnight UTC, rejecting anything else. */
  const parseUtcDay = (label: string, value: string): Date => {
    const parsed = new Date(`${value}T00:00:00Z`);

    // The round-trip comparison rejects dates the engine would otherwise roll
    // over (e.g. "2024-02-30" becoming 1st March). `toISOString` throws on an
    // invalid date, so it must only be reached after the NaN check.
    if (
      !/^\d{4}-\d{2}-\d{2}$/.test(value) ||
      Number.isNaN(parsed.getTime()) ||
      parsed.toISOString().slice(0, 10) !== value
    ) {
      throw new Error(
        `Invalid ${label} "${value}": expected a calendar date in YYYY-MM-DD format`,
      );
    }

    return parsed;
  };

  const firstDay = parseUtcDay("startDate", startDate);
  const lastDay = parseUtcDay("endDate", endDate);

  const chunks: Array<{ start: string; end: string }> = [];

  for (
    const day = new Date(firstDay);
    day <= lastDay;
    day.setUTCDate(day.getUTCDate() + 1)
  ) {
    const followingDay = new Date(day);
    followingDay.setUTCDate(followingDay.getUTCDate() + 1);

    chunks.push({
      start: `${day.toISOString().slice(0, 10)}T04:00:00Z`,
      end: `${followingDay.toISOString().slice(0, 10)}T03:59:59Z`,
    });
  }

  return chunks;
};
+ * Automatically chunks requests into 24-hour periods due to API limitations. + * + * @param airportIcao - ICAO airport code + * @param startDate - Start date in YYYY-MM-DD format + * @param endDate - End date in YYYY-MM-DD format (inclusive) + * @returns Array of all flights across the date range + */ +const getAllHistoricalArrivalsForDateRange = async ( + airportIcao: string, + startDate: string, + endDate: string, +): Promise => { + const chunks = generateDateChunks(startDate, endDate); + const allFlights: AeroApiScheduledFlight[] = []; + + for (const chunk of chunks) { + const flights = await getAllHistoricalArrivals({ + airportIcao, + start: chunk.start, + end: chunk.end, + }); + allFlights.push(...flights); + } + + return allFlights; +}; + +/** + * Fetch historical arrivals for an airport over a date range and map them to HASH entities. + * + * @param airportIcao - ICAO airport code (e.g., "EGLL" for London Heathrow) + * @param startDate - Start date in YYYY-MM-DD format + * @param endDate - End date in YYYY-MM-DD format (inclusive, must be yesterday or earlier) + * @returns Deduplicated entities and links ready for database insertion + */ +export const getHistoricalArrivalEntities = async ( + airportIcao: string, + startDate: string, + endDate: string, +): Promise< + BatchFlightGraphResult & { + provenance: Pick; + } +> => { + const flights = await getAllHistoricalArrivalsForDateRange( + airportIcao, + startDate, + endDate, + ); + + const provenance = generateAeroApiProvenance(); + + return { + ...buildFlightGraphBatch(flights, provenance), + provenance, + }; +}; diff --git a/libs/@local/hash-backend-utils/src/integrations/aviation/aero-api/client/types.ts b/libs/@local/hash-backend-utils/src/integrations/aviation/aero-api/client/types.ts index 47500dd0087..6d60e50a403 100644 --- a/libs/@local/hash-backend-utils/src/integrations/aviation/aero-api/client/types.ts +++ b/libs/@local/hash-backend-utils/src/integrations/aviation/aero-api/client/types.ts @@ -12,7 
+12,7 @@ export type ScheduledArrivalsRequestParams = { start?: string; /** End of time range (ISO 8601) */ end?: string; - /** Maximum number of results per page (default: 15, max: 200) */ + /** Maximum number of pages to fetch (default: 1) */ max_pages?: number; /** Cursor for pagination */ cursor?: string; @@ -169,3 +169,37 @@ export type AeroApiScheduledArrivalsResponse = { /** Array of scheduled flights */ scheduled_arrivals: AeroApiScheduledFlight[]; }; + +/** + * Request parameters for the historical arrivals endpoint. + * Note: The historical endpoint has a maximum time range of 24 hours. + */ +export type HistoricalArrivalsRequestParams = { + /** ICAO airport code */ + airportIcao: string; + /** Filter by airline (ICAO code) */ + airline?: string; + /** Filter by flight type (e.g., "Airline", "General_Aviation") */ + type?: string; + /** Start of time range (ISO 8601) - required for historical queries */ + start: string; + /** End of time range (ISO 8601) - required, max 24 hours from start */ + end: string; + /** Maximum number of pages to fetch (default: 1) */ + max_pages?: number; + /** Cursor for pagination */ + cursor?: string; +}; + +/** + * Response from the historical arrivals endpoint. + * Same structure as scheduled arrivals but with `arrivals` array. 
+ */ +export type AeroApiHistoricalArrivalsResponse = { + /** Pagination links */ + links: AeroApiPaginationLinks | null; + /** Number of pages available */ + num_pages: number; + /** Array of historical flights */ + arrivals: AeroApiScheduledFlight[]; +}; diff --git a/libs/@local/hash-backend-utils/src/integrations/aviation/flightradar24/client.ts b/libs/@local/hash-backend-utils/src/integrations/aviation/flightradar24/client.ts index 774eaece161..7d02836b559 100644 --- a/libs/@local/hash-backend-utils/src/integrations/aviation/flightradar24/client.ts +++ b/libs/@local/hash-backend-utils/src/integrations/aviation/flightradar24/client.ts @@ -1,6 +1,8 @@ import type { ProvidedEntityEditionProvenance } from "@blockprotocol/type-system"; +import { stringifyError } from "@local/hash-isomorphic-utils/stringify-error"; import type { Flight as HashFlight } from "@local/hash-isomorphic-utils/system-types/flight"; +import { createRateLimitedRequester } from "../../../rate-limiter.js"; import { mapFlight } from "./client/flight.js"; import { generateFlightradar24Provenance } from "./client/provenance.js"; import type { @@ -11,6 +13,17 @@ import type { const baseUrl = "https://fr24api.flightradar24.com/api/"; +/** + * Minimum interval between requests in milliseconds. + * Flightradar24 API rate limit is 10 queries per minute = 6 seconds per query. + */ +const REQUEST_INTERVAL_MS = 6000; + +/** + * Maximum number of retry attempts for rate limit errors. + */ +const MAX_RETRIES = 10; + const generateUrl = (path: string, params?: Record) => { const url = new URL(`${baseUrl}${path}`); @@ -31,7 +44,11 @@ const generateUrl = (path: string, params?: Record) => { return url.toString(); }; -const makeRequest = async (url: string): Promise => { +/** + * Raw fetch function for Flightradar24 API requests. + * Throws an error with status property for rate limit handling. 
+ */ +async function fetchFlightradar24(url: string): Promise { const apiToken = process.env.FLIGHTRADAR24_API_TOKEN; if (!apiToken) { @@ -41,27 +58,46 @@ const makeRequest = async (url: string): Promise => { const response = await fetch(url, { headers: { Accept: "application/json", + "Accept-Version": "v1", Authorization: `Bearer ${apiToken}`, }, }); if (!response.ok) { - const errorData = (await response.json()) as ErrorResponse; - throw new Error( - `Flightradar24 API error: ${errorData.error.message} (code: ${errorData.error.code})`, + const errorData: unknown = await response.json(); + const error = new Error( + `Flightradar24 API error: ${stringifyError(errorData)}`, ); + (error as Error & { status: number }).status = response.status; + throw error; } - const data = (await response.json()) as T | ErrorResponse; + const data: unknown = await response.json(); - if ("error" in data) { + // Check for error response structure + if ( + typeof data === "object" && + data !== null && + "error" in data && + typeof (data as ErrorResponse).error === "object" + ) { + const errorResponse = data as ErrorResponse; throw new Error( - `Flightradar24 API error: ${data.error.message} (code: ${data.error.code})`, + `Flightradar24 API error: ${errorResponse.error.message} (code: ${errorResponse.error.code})`, ); } - return data; -}; + return data as T; +} + +/** + * Rate-limited request function for Flightradar24 API. + * Uses promise chaining to ensure proper request spacing and handles 429 errors with backoff. + */ +const makeRequest = createRateLimitedRequester(fetchFlightradar24, { + requestIntervalMs: REQUEST_INTERVAL_MS, + maxRetries: MAX_RETRIES, +}); /** * Retrieve live flight position data from Flightradar24's flight-positions/light endpoint. 
diff --git a/libs/@local/hash-backend-utils/src/integrations/aviation/shared/primary-keys.ts b/libs/@local/hash-backend-utils/src/integrations/aviation/shared/primary-keys.ts index 26e7ecc2d10..4bbfb97c08b 100644 --- a/libs/@local/hash-backend-utils/src/integrations/aviation/shared/primary-keys.ts +++ b/libs/@local/hash-backend-utils/src/integrations/aviation/shared/primary-keys.ts @@ -52,6 +52,17 @@ export const generatePrimaryKey = { }, }; +/** + * The properties needed to generate a primary key for each entity type. + */ +export type PrimaryKeyInput = { + [K in keyof typeof generatePrimaryKey]: { + [P in keyof Parameters<(typeof generatePrimaryKey)[K]>[0]]-?: NonNullable< + Parameters<(typeof generatePrimaryKey)[K]>[0][P] + >; + }; +}; + /** * Generates Graph API filters to find existing entities matching a proposed entity. */ diff --git a/libs/@local/hash-backend-utils/src/rate-limiter.ts b/libs/@local/hash-backend-utils/src/rate-limiter.ts new file mode 100644 index 00000000000..cb0d53867fe --- /dev/null +++ b/libs/@local/hash-backend-utils/src/rate-limiter.ts @@ -0,0 +1,91 @@ +import { backOff } from "exponential-backoff"; + +/** + * Options for configuring a rate-limited requester. + */ +export type RateLimiterOptions = { + /** Minimum interval between requests in milliseconds */ + requestIntervalMs: number; + /** Maximum number of retry attempts for rate limit errors */ + maxRetries: number; +}; + +const sleep = (ms: number): Promise => + new Promise((resolve) => { + setTimeout(resolve, ms); + }); + +/** + * Creates a rate-limited request function that ensures proper request spacing + * and handles rate limit (429) errors with exponential backoff. + * + * Uses promise chaining to prevent race conditions between concurrent requests, + * ensuring requests are spaced at least `requestIntervalMs` apart. 
/**
 * Creates a rate-limited request function that ensures proper request spacing
 * and handles rate limit (429) errors with exponential backoff.
 *
 * Requests are serialised through a single promise chain: each call starts
 * only after the previous call has settled and a further `requestIntervalMs`
 * has elapsed. Backoff retries for a call happen while it holds its place in
 * the chain, so later calls wait behind it.
 *
 * @param requestFn - The underlying request function to wrap (must be generic).
 *   To be retried, a rate-limit failure must reject with an object whose
 *   `status` is 429, or with an `Error` whose message contains "429".
 * @param options - Rate limiter configuration
 * @returns A rate-limited version of the request function that preserves type
 *   parameters. The returned promise settles with the outcome of the request;
 *   errors that are not rate-limit errors are rejected without retrying.
 *
 * @example
 * ```typescript
 * const rateLimitedFetch = createRateLimitedRequester(
 *   async <T>(url: string): Promise<T> => {
 *     const response = await fetch(url);
 *     if (!response.ok) {
 *       const error = new Error(`HTTP ${response.status}`);
 *       (error as Error & { status: number }).status = response.status;
 *       throw error;
 *     }
 *     return response.json() as T;
 *   },
 *   { requestIntervalMs: 1000, maxRetries: 10 }
 * );
 * ```
 */
export const createRateLimitedRequester = (
  requestFn: <T>(url: string) => Promise<T>,
  options: RateLimiterOptions,
): (<T>(url: string) => Promise<T>) => {
  // Tail of the queue: settles once the most recently queued request has
  // finished and its trailing spacing delay has elapsed. Never rejects.
  let requestQueue: Promise<void> = Promise.resolve();

  /** Decide whether a failure is a rate-limit error and worth retrying. */
  const isRateLimitError = (error: unknown): boolean => {
    // A `status` property, when present, is authoritative
    if (error && typeof error === "object" && "status" in error) {
      return (error as { status: number }).status === 429;
    }
    // Otherwise fall back to an error message that mentions the status code
    return error instanceof Error && error.message.includes("429");
  };

  return <T>(url: string): Promise<T> => {
    // Start this request only once everything queued before it is done
    const pending = requestQueue.then(() =>
      backOff(() => requestFn<T>(url), {
        // `numOfAttempts` counts the initial call as well as the retries, so
        // `maxRetries` retries require one additional attempt.
        numOfAttempts: options.maxRetries + 1,
        startingDelay: 1000,
        maxDelay: 30_000,
        jitter: "full",
        retry: isRateLimitError,
      }),
    );

    // Keep the queue alive whether the request succeeded or failed, and wait
    // after either outcome so the next request is properly spaced.
    requestQueue = pending
      .then(
        () => undefined,
        () => undefined,
      )
      .then(() => sleep(options.requestIntervalMs));

    return pending;
  };
};
'EGLL' for London Heathrow)", + required: true, + array: false, + }, + { + oneOfPayloadKinds: ["Date"], + name: "startDate", + description: + "The start date for the historical query in ISO format (e.g. '2024-01-15')", + required: true, + array: false, + }, + { + oneOfPayloadKinds: ["Date"], + name: "endDate", + description: + "The end date for the historical query in ISO format (e.g. '2024-01-16') – must be yesterday or earlier", + required: true, + array: false, + }, + ], + outputs: [ + { + payloadKind: "ProposedEntity", + name: "proposedEntities", + description: "The proposed flight entities and related data", + array: true, + required: true, + }, + ], + }, + getLiveFlightPositions: { + actionDefinitionId: "getLiveFlightPositions", + name: "Get Live Flight Positions", + description: + "Fetch live flight positions from FlightRadar24 for flights that have departed or recently arrived.", + kind: "action", + inputs: [ + { + oneOfPayloadKinds: ["PersistedEntitiesMetadata"], + name: "persistedEntities", + description: + "The persisted flight entities to check for live positions", + required: true, + array: false, + }, + ], + outputs: [ + { + payloadKind: "ProposedEntity", + name: "proposedEntities", + description: "Updated flight entities with live position data", + array: true, + required: true, + }, + ], + }, getScheduledFlights: { actionDefinitionId: "getScheduledFlights", name: "Get Scheduled Flights", @@ -680,18 +751,18 @@ type AiFlowInputPayloadType< (typeof aiFlowActionDefinitionsAsConst)[T]["inputs"][number], { name: N } > extends { required: true; array: true } - ? PayloadKindValues[InputPayloadKindForAiFlowAction][] + ? PayloadValue, true> : Extract< (typeof aiFlowActionDefinitionsAsConst)[T]["inputs"][number], { name: N } > extends { required: false; array: true } - ? PayloadKindValues[InputPayloadKindForAiFlowAction][] | undefined + ? 
PayloadValue, true> | undefined : Extract< (typeof aiFlowActionDefinitionsAsConst)[T]["inputs"][number], { name: N } > extends { required: true; array: false } - ? PayloadKindValues[InputPayloadKindForAiFlowAction] - : PayloadKindValues[InputPayloadKindForAiFlowAction] | undefined; + ? PayloadValue, false> + : PayloadValue, false> | undefined; type SimplifiedActionInputsObject = { [N in InputNameForAiFlowAction]: AiFlowInputPayloadType; @@ -727,21 +798,21 @@ type IntegrationFlowInputPayloadType< (typeof integrationFlowActionDefinitionsAsConst)[T]["inputs"][number], { name: N } > extends { required: true; array: true } - ? PayloadKindValues[InputPayloadKindForIntegrationFlowAction][] + ? PayloadValue, true> : Extract< (typeof integrationFlowActionDefinitionsAsConst)[T]["inputs"][number], { name: N } > extends { required: false; array: true } ? - | PayloadKindValues[InputPayloadKindForIntegrationFlowAction][] + | PayloadValue, true> | undefined : Extract< (typeof integrationFlowActionDefinitionsAsConst)[T]["inputs"][number], { name: N } > extends { required: true; array: false } - ? PayloadKindValues[InputPayloadKindForIntegrationFlowAction] + ? PayloadValue, false> : - | PayloadKindValues[InputPayloadKindForIntegrationFlowAction] + | PayloadValue, false> | undefined; type SimplifiedIntegrationActionInputsObject< @@ -791,19 +862,17 @@ export const getSimplifiedIntegrationFlowActionInputs = < type ActionStepOutput< OutputDef extends { name: string; - payloadKind: keyof PayloadKindValues; + payloadKind: PayloadKind; array: boolean; }, > = OutputDef extends { name: infer N extends string; - payloadKind: infer K extends keyof PayloadKindValues; + payloadKind: infer K extends PayloadKind; array: infer A extends boolean; } ? { outputName: N; - payload: A extends true - ? 
{ kind: K; value: PayloadKindValues[K][] } - : { kind: K; value: PayloadKindValues[K] }; + payload: { kind: K; value: PayloadValue }; } : never; diff --git a/libs/@local/hash-isomorphic-utils/src/flows/integration-flow-definitions.ts b/libs/@local/hash-isomorphic-utils/src/flows/integration-flow-definitions.ts index c377e1b5553..a8baf930c8d 100644 --- a/libs/@local/hash-isomorphic-utils/src/flows/integration-flow-definitions.ts +++ b/libs/@local/hash-isomorphic-utils/src/flows/integration-flow-definitions.ts @@ -7,6 +7,111 @@ import type { } from "./action-definitions.js"; import type { FlowDefinition } from "./types.js"; +/** + * Flow definition for fetching historical flight arrivals for an airport over a date range and persisting them to the graph. + */ +export const historicalFlightsFlowDefinition: FlowDefinition = + { + name: "Get Historical Flights", + type: "integration", + flowDefinitionId: "historical-flights" as EntityUuid, + groups: [ + { + groupId: 1, + description: "Retrieve and save historical flights", + }, + ], + description: + "Fetch and save historical flight arrivals for an airport over a date range.", + trigger: { + triggerDefinitionId: "userTrigger", + description: + "User provides an airport ICAO code and date range to fetch historical flights for", + kind: "trigger", + outputs: [ + { + payloadKind: "Text", + name: "Airport ICAO", + array: false, + required: true, + }, + { + payloadKind: "Date", + name: "Start Date", + array: false, + required: true, + }, + { + payloadKind: "Date", + name: "End Date", + array: false, + required: true, + }, + ], + }, + steps: [ + { + stepId: "1", + groupId: 1, + kind: "action", + actionDefinitionId: "getHistoricalFlightArrivals", + description: + "Fetch historical flight arrivals for the specified airport and date range", + inputSources: [ + { + inputName: + "airportIcao" satisfies InputNameForIntegrationFlowAction<"getHistoricalFlightArrivals">, + kind: "step-output", + sourceStepId: "trigger", + 
sourceStepOutputName: "Airport ICAO", + }, + { + inputName: + "startDate" satisfies InputNameForIntegrationFlowAction<"getHistoricalFlightArrivals">, + kind: "step-output", + sourceStepId: "trigger", + sourceStepOutputName: "Start Date", + }, + { + inputName: + "endDate" satisfies InputNameForIntegrationFlowAction<"getHistoricalFlightArrivals">, + kind: "step-output", + sourceStepId: "trigger", + sourceStepOutputName: "End Date", + }, + ], + }, + { + stepId: "2", + groupId: 1, + kind: "action", + description: "Save discovered entities and relationships to HASH graph", + actionDefinitionId: "persistIntegrationEntities", + inputSources: [ + { + inputName: + "proposedEntities" satisfies InputNameForIntegrationFlowAction<"persistIntegrationEntities">, + kind: "step-output", + sourceStepId: "1", + sourceStepOutputName: + "proposedEntities" satisfies OutputNameForIntegrationFlowAction<"getHistoricalFlightArrivals">, + }, + ], + }, + ], + outputs: [ + { + stepId: "2", + stepOutputName: + "persistedEntities" satisfies OutputNameForIntegrationFlowAction<"persistIntegrationEntities">, + payloadKind: "PersistedEntitiesMetadata", + name: "persistedEntities" as const, + array: false, + required: true, + }, + ], + }; + /** * Flow definition for fetching scheduled flights for an airport on a given date and persisting them to the graph. 
*/ @@ -15,8 +120,18 @@ export const scheduledFlightsFlowDefinition: FlowDefinition, + kind: "step-output", + sourceStepId: "2", + sourceStepOutputName: + "persistedEntities" satisfies OutputNameForIntegrationFlowAction<"persistIntegrationEntities">, + }, + ], + }, + { + stepId: "4", + groupId: 2, + kind: "action", + description: "Save live flight position updates to HASH graph", + actionDefinitionId: "persistIntegrationEntities", + inputSources: [ + { + inputName: + "proposedEntities" satisfies InputNameForIntegrationFlowAction<"persistIntegrationEntities">, + kind: "step-output", + sourceStepId: "3", + sourceStepOutputName: + "proposedEntities" satisfies OutputNameForIntegrationFlowAction<"getLiveFlightPositions">, + }, + ], + }, ], outputs: [ { - stepId: "2", + stepId: "4", stepOutputName: "persistedEntities" satisfies OutputNameForIntegrationFlowAction<"persistIntegrationEntities">, payloadKind: "PersistedEntitiesMetadata", diff --git a/libs/@local/hash-isomorphic-utils/src/flows/types.ts b/libs/@local/hash-isomorphic-utils/src/flows/types.ts index 9bf7411346d..f9860c9b208 100644 --- a/libs/@local/hash-isomorphic-utils/src/flows/types.ts +++ b/libs/@local/hash-isomorphic-utils/src/flows/types.ts @@ -134,21 +134,147 @@ export type PayloadKindValues = { export type PayloadKind = keyof PayloadKindValues; +/** + * A reference to a payload that has been stored in S3. + * Used to avoid passing large payloads through Temporal activities. 
+ * + * @template K - The payload kind being stored + * @template IsArray - Whether the stored value is an array of K values + */ +export type StoredPayloadRef< + K extends StoredPayloadKind = StoredPayloadKind, + IsArray extends boolean = boolean, +> = { + /** Discriminator to identify this as a stored reference */ + __stored: true; + /** The payload kind being stored - for type checking */ + kind: K; + /** S3 storage key */ + storageKey: string; + /** Whether the stored value is an array */ + array: IsArray; +}; + +/** + * A stored payload reference to a singular value. + */ +export type SingularStoredPayloadRef< + K extends StoredPayloadKind = StoredPayloadKind, +> = StoredPayloadRef; + +/** + * A stored payload reference to an array of values. + */ +export type ArrayStoredPayloadRef< + K extends StoredPayloadKind = StoredPayloadKind, +> = StoredPayloadRef; + +/** Type guard to check if a value is a stored payload reference */ +export const isStoredPayloadRef = ( + value: unknown, +): value is StoredPayloadRef => { + return ( + typeof value === "object" && + value !== null && + "__stored" in value && + value.__stored === true + ); +}; + +/** Type guard to check if a stored payload ref is for an array */ +export const isArrayStoredPayloadRef = ( + ref: StoredPayloadRef, +): ref is ArrayStoredPayloadRef => ref.array; + +/** Type guard to check if a stored payload ref is for a singular value */ +export const isSingularStoredPayloadRef = ( + ref: StoredPayloadRef, +): ref is SingularStoredPayloadRef => !ref.array; + +/** + * Payload kinds that are always stored in S3 due to their potential size. + * These kinds will have StoredPayloadRef as their value type in activity outputs. + */ +export const storedPayloadKinds = [ + "PersistedEntitiesMetadata", + "ProposedEntity", + "ProposedEntityWithResolvedLinks", +] as const; + +export type StoredPayloadKind = (typeof storedPayloadKinds)[number]; + +/** + * Check if a payload kind is always stored in S3. 
+ */ +export const isStoredPayloadKind = ( + kind: PayloadKind, +): kind is StoredPayloadKind => + storedPayloadKinds.includes(kind as StoredPayloadKind); + +/** + * Payload value type used in activity outputs and inputs. + * For stored payload kinds, the value is always a StoredPayloadRef with the array-ness encoded. + * For other kinds, the value is the actual payload value (or array of values). + */ +export type PayloadValue< + K extends PayloadKind, + IsArray extends boolean, +> = K extends StoredPayloadKind + ? StoredPayloadRef + : IsArray extends true + ? PayloadKindValues[K][] + : PayloadKindValues[K]; + +/** + * Singular payload types for all payload kinds. + * For stored payload kinds, the value is a SingularStoredPayloadRef. + */ export type SingularPayload = { + [K in keyof PayloadKindValues]: K extends StoredPayloadKind + ? { kind: K; value: SingularStoredPayloadRef } + : { kind: K; value: PayloadKindValues[K] }; +}[keyof PayloadKindValues]; + +/** + * Array payload types for all payload kinds. + * For stored payload kinds, the value is an ArrayStoredPayloadRef (which represents the stored array). + */ +export type ArrayPayload = { + [K in keyof PayloadKindValues]: K extends StoredPayloadKind + ? { kind: K; value: ArrayStoredPayloadRef } + : { kind: K; value: PayloadKindValues[K][] }; +}[keyof PayloadKindValues]; + +/** + * General payload type used throughout the flow system. + * For stored payload kinds (ProposedEntity, ProposedEntityWithResolvedLinks, PersistedEntitiesMetadata), + * the value may be a StoredPayloadRef that activities will resolve. + */ +export type Payload = SingularPayload | ArrayPayload; + +/** + * Resolved payload types - used after stored refs have been resolved (e.g., in GraphQL responses). + * These contain actual values instead of StoredPayloadRef for stored payload kinds. 
+ */ +export type ResolvedSingularPayload = { [K in keyof PayloadKindValues]: { kind: K; value: PayloadKindValues[K]; }; }[keyof PayloadKindValues]; -export type ArrayPayload = { +export type ResolvedArrayPayload = { [K in keyof PayloadKindValues]: { kind: K; value: PayloadKindValues[K][]; }; }[keyof PayloadKindValues]; -export type Payload = SingularPayload | ArrayPayload; +/** + * Payload type after stored refs have been resolved. + * Used in frontend/GraphQL contexts where the backend has already resolved StoredPayloadRefs. + */ +export type ResolvedPayload = ResolvedSingularPayload | ResolvedArrayPayload; /** * Step Definition @@ -346,10 +472,26 @@ export type StepOutput

= { payload: P; }; +/** + * StepOutput with resolved payload - used in frontend/GraphQL contexts + * where stored refs have been resolved by the backend. + */ +export type ResolvedStepOutput = { + outputName: string; + payload: ResolvedPayload; +}; + export type StepRunOutput = Status< Required, "outputs">> >; +/** + * StepRunOutput with resolved payloads - used in frontend/GraphQL contexts. + */ +export type ResolvedStepRunOutput = Status<{ + outputs: ResolvedStepOutput[]; +}>; + export type ActionStep< ActionDefinitionId extends FlowActionDefinitionId = FlowActionDefinitionId, > = { diff --git a/libs/@local/hash-isomorphic-utils/src/graphql/scalar-mapping.ts b/libs/@local/hash-isomorphic-utils/src/graphql/scalar-mapping.ts index 8636551e56a..df090bcf34d 100644 --- a/libs/@local/hash-isomorphic-utils/src/graphql/scalar-mapping.ts +++ b/libs/@local/hash-isomorphic-utils/src/graphql/scalar-mapping.ts @@ -148,7 +148,8 @@ export const scalars = { ExternalInputResponseWithoutUser: "@local/hash-isomorphic-utils/flows/types#ExternalInputResponseWithoutUser", StepInput: "@local/hash-isomorphic-utils/flows/types#StepInput", - StepRunOutput: "@local/hash-isomorphic-utils/flows/types#StepRunOutput", + ResolvedStepRunOutput: + "@local/hash-isomorphic-utils/flows/types#ResolvedStepRunOutput", StepProgressLog: "@local/hash-isomorphic-utils/flows/types#StepProgressLog", RoleAssignmentStatus: "@local/hash-graph-client#RoleAssignmentStatus", diff --git a/libs/@local/hash-isomorphic-utils/src/graphql/type-defs/knowledge/flow.typedef.ts b/libs/@local/hash-isomorphic-utils/src/graphql/type-defs/knowledge/flow.typedef.ts index fd169d6a57c..1947e402e17 100644 --- a/libs/@local/hash-isomorphic-utils/src/graphql/type-defs/knowledge/flow.typedef.ts +++ b/libs/@local/hash-isomorphic-utils/src/graphql/type-defs/knowledge/flow.typedef.ts @@ -52,7 +52,7 @@ export const flowTypedef = gql` scalar ExternalInputRequest scalar FlowInputs scalar StepInput - scalar StepRunOutput + scalar 
ResolvedStepRunOutput scalar StepProgressLog # FlowActionDefinitionId is just here so that the type is generated along with the other scalars, # as we need to pass it to FlowDefinition. @@ -122,7 +122,7 @@ export const flowTypedef = gql` """ Outputs of the step """ - outputs: [StepRunOutput!] + outputs: [ResolvedStepRunOutput!] } type FlowRun { @@ -176,7 +176,7 @@ export const flowTypedef = gql` """ Outputs of the flow run """ - outputs: [StepRunOutput!] + outputs: [ResolvedStepRunOutput!] """ Any requests for external input made by steps within the Flow """ diff --git a/tests/hash-backend-integration/src/tests/graph/knowledge/system-types/file.test.ts b/tests/hash-backend-integration/src/tests/graph/knowledge/system-types/file.test.ts index 1c00c1f8063..9715e40c7b7 100644 --- a/tests/hash-backend-integration/src/tests/graph/knowledge/system-types/file.test.ts +++ b/tests/hash-backend-integration/src/tests/graph/knowledge/system-types/file.test.ts @@ -5,7 +5,12 @@ import { createFileFromUploadRequest, } from "@apps/hash-api/src/graph/knowledge/system-types/file"; import type { User } from "@apps/hash-api/src/graph/knowledge/system-types/user"; -import type { EntityId, Timestamp, WebId } from "@blockprotocol/type-system"; +import type { + EntityId, + Timestamp, + Url, + WebId, +} from "@blockprotocol/type-system"; import { Logger } from "@local/hash-backend-utils/logger"; import { beforeAll, describe, expect, it, vi } from "vitest"; @@ -47,11 +52,16 @@ describe("File", () => { const entityId = "abc~123" as EntityId; const editionIdentifier = "ed123" as Timestamp; const fileKey = `${entityId}/${editionIdentifier}/mock-test-key` as const; - const downloadUrl = "mock-download-url"; + const downloadUrl = "mock-download-url" as Url; const uploadUrl = "mock-upload-url"; graphContext.uploadProvider = { getFileEntityStorageKey: vi.fn(() => fileKey), + getFlowOutputStorageKey: vi.fn(() => "mock-flow-output-key"), + uploadDirect: vi.fn(() => Promise.resolve()), + 
downloadDirect: vi.fn(() => + Promise.resolve(Buffer.from("mock-download-body")), + ), presignDownload: vi.fn(() => Promise.resolve(downloadUrl)), presignUpload: vi.fn(() => Promise.resolve({ diff --git a/tests/hash-backend-integration/src/tests/util.ts b/tests/hash-backend-integration/src/tests/util.ts index e53baccc02a..300ee42de0a 100644 --- a/tests/hash-backend-integration/src/tests/util.ts +++ b/tests/hash-backend-integration/src/tests/util.ts @@ -54,6 +54,21 @@ export const createTestImpureGraphContext = (): ImpureGraphContext< }, }, uploadProvider: { + getFlowOutputStorageKey: () => { + throw new Error( + "File fetching not implemented in tests. Override with mock to test.", + ); + }, + downloadDirect: () => { + throw new Error( + "File fetching not implemented in tests. Override with mock to test.", + ); + }, + uploadDirect: () => { + throw new Error( + "File fetching not implemented in tests. Override with mock to test.", + ); + }, getFileEntityStorageKey: (_params) => { throw new Error( "File fetching not implemented in tests. Override with mock to test.", diff --git a/yarn.lock b/yarn.lock index 7c95bb4fd60..32c40cbb39e 100644 --- a/yarn.lock +++ b/yarn.lock @@ -10323,6 +10323,7 @@ __metadata: "@vitest/coverage-istanbul": "npm:3.2.4" agentkeepalive: "npm:4.6.0" axios: "npm:1.12.2" + cache-manager: "npm:5.7.6" dotenv-flow: "npm:3.3.0" eslint: "npm:9.39.2" exponential-backoff: "npm:3.1.3"