From 2f10d834adafdddd011758e0a9f132e2f902c2ba Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Fri, 27 Mar 2026 19:56:22 -0400 Subject: [PATCH 01/18] feat: add Qwen Image 2512 txt2img support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shares the QwenImageEdit base type and infrastructure with the edit model. Key changes: - Text encoder: auto-selects prompt template based on reference images — edit template (drop_idx=64) when images present, generate template (drop_idx=34) when absent - Denoise: detects zero_cond_t to determine whether to concatenate reference latents; txt2img models pass only noisy patches with a single-entry img_shapes - Model config: accept QwenImagePipeline in addition to QwenImageEditPlusPipeline - LoRA: handle "transformer." key prefix from some training frameworks, add to config detection - Starter models: Qwen-Image-2512 full + 4 GGUF variants + Lightning V2.0 LoRAs (4-step, 8-step), all added to the Qwen Image Edit bundle Co-Authored-By: Claude Opus 4.6 (1M context) --- .../app/invocations/qwen_image_denoise.py | 63 +++++++++++------ .../invocations/qwen_image_text_encoder.py | 47 +++++++++---- .../backend/model_manager/starter_models.py | 67 +++++++++++++++++++ 3 files changed, 142 insertions(+), 35 deletions(-) diff --git a/invokeai/app/invocations/qwen_image_denoise.py b/invokeai/app/invocations/qwen_image_denoise.py index cd3ff917596..4b9fb207680 100644 --- a/invokeai/app/invocations/qwen_image_denoise.py +++ b/invokeai/app/invocations/qwen_image_denoise.py @@ -353,29 +353,44 @@ def _run_diffusion(self, context: InvocationContext): # Pack latents into 2x2 patches: (B, C, H, W) -> (B, H/2*W/2, C*4) latents = self._pack_latents(latents, 1, out_channels, latent_height, latent_width) - # Pack reference image latents and concatenate along the sequence dimension. - # The edit transformer always expects [noisy_patches ; ref_patches] in its sequence. 
- if ref_latents is not None: - _, ref_ch, rh, rw = ref_latents.shape - if rh != latent_height or rw != latent_width: - ref_latents = torch.nn.functional.interpolate( - ref_latents, size=(latent_height, latent_width), mode="bilinear" + # Determine whether the model uses reference latent conditioning (zero_cond_t). + # Edit models (zero_cond_t=True) expect [noisy_patches ; ref_patches] in the sequence. + # Txt2img models (zero_cond_t=False) only take noisy patches. + has_zero_cond_t = getattr(transformer_info.model, "zero_cond_t", False) or getattr( + transformer_info.model.config, "zero_cond_t", False + ) + use_ref_latents = has_zero_cond_t + + ref_latents_packed = None + if use_ref_latents: + if ref_latents is not None: + _, ref_ch, rh, rw = ref_latents.shape + if rh != latent_height or rw != latent_width: + ref_latents = torch.nn.functional.interpolate( + ref_latents, size=(latent_height, latent_width), mode="bilinear" + ) + else: + # No reference image provided — use zeros so the model still gets the + # expected sequence layout. + ref_latents = torch.zeros( + 1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype ) + ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width) + + # img_shapes tells the transformer the spatial layout of patches. + if use_ref_latents: + img_shapes = [ + [ + (1, latent_height // 2, latent_width // 2), + (1, latent_height // 2, latent_width // 2), + ] + ] else: - # No reference image provided — use zeros so the model still gets the - # expected sequence layout. - ref_latents = torch.zeros( - 1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype - ) - ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width) - - # img_shapes tells the transformer the spatial layout of noisy and reference patches. 
- img_shapes = [ - [ - (1, latent_height // 2, latent_width // 2), - (1, latent_height // 2, latent_width // 2), + img_shapes = [ + [ + (1, latent_height // 2, latent_width // 2), + ] ] - ] # Prepare inpaint extension (operates in 4D space, so unpack/repack around it) inpaint_mask = self._prep_inpaint_mask(context, noise) # noise has the right 4D shape @@ -428,8 +443,12 @@ def _run_diffusion(self, context: InvocationContext): # The pipeline passes timestep / 1000 to the transformer timestep = t.expand(latents.shape[0]).to(inference_dtype) - # Concatenate noisy and reference patches along the sequence dim - model_input = torch.cat([latents, ref_latents_packed], dim=1) + # For edit models: concatenate noisy and reference patches along the sequence dim + # For txt2img models: just use noisy patches + if ref_latents_packed is not None: + model_input = torch.cat([latents, ref_latents_packed], dim=1) + else: + model_input = latents noise_pred_cond = transformer( hidden_states=model_input, diff --git a/invokeai/app/invocations/qwen_image_text_encoder.py b/invokeai/app/invocations/qwen_image_text_encoder.py index 641e8c4d388..74670735877 100644 --- a/invokeai/app/invocations/qwen_image_text_encoder.py +++ b/invokeai/app/invocations/qwen_image_text_encoder.py @@ -20,26 +20,44 @@ QwenImageConditioningInfo, ) -# The Qwen Image Edit pipeline uses a specific system prompt and drops the first -# N tokens (the system prompt prefix) from the embeddings. These constants are -# taken directly from the diffusers QwenImagePipeline. -_SYSTEM_PROMPT = ( +# Prompt templates and drop indices for the two Qwen Image model modes. +# These are taken directly from the diffusers pipelines. + +# Image editing mode (QwenImagePipeline) +_EDIT_SYSTEM_PROMPT = ( "Describe the key features of the input image (color, shape, size, texture, objects, background), " "then explain how the user's text instruction should alter or modify the image. 
" "Generate a new image that meets the user's requirements while maintaining consistency " "with the original input where appropriate." ) +_EDIT_DROP_IDX = 64 + +# Text-to-image mode (QwenImagePipeline) +_GENERATE_SYSTEM_PROMPT = ( + "Describe the image by detailing the color, shape, size, texture, quantity, " + "text, spatial relationships of the objects and background:" +) +_GENERATE_DROP_IDX = 34 + _IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>" -_DROP_IDX = 64 def _build_prompt(user_prompt: str, num_images: int) -> str: - """Build the full prompt with one vision placeholder per reference image.""" - image_tokens = _IMAGE_PLACEHOLDER * max(num_images, 1) - return ( - f"<|im_start|>system\n{_SYSTEM_PROMPT}<|im_end|>\n" - f"<|im_start|>user\n{image_tokens}{user_prompt}<|im_end|>\n" - "<|im_start|>assistant\n" + """Build the full prompt with the appropriate template based on whether reference images are provided.""" + if num_images > 0: + # Edit mode: include vision placeholders for reference images + image_tokens = _IMAGE_PLACEHOLDER * num_images + return ( + f"<|im_start|>system\n{_EDIT_SYSTEM_PROMPT}<|im_end|>\n" + f"<|im_start|>user\n{image_tokens}{user_prompt}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + else: + # Generate mode: text-only prompt + return ( + f"<|im_start|>system\n{_GENERATE_SYSTEM_PROMPT}<|im_end|>\n" + f"<|im_start|>user\n{user_prompt}<|im_end|>\n" + "<|im_start|>assistant\n" ) @@ -188,7 +206,10 @@ def _encode( hidden_states = outputs.hidden_states[-1] # Extract valid (non-padding) tokens using the attention mask, - # then drop the first _DROP_IDX tokens (system prompt prefix). + # then drop the system prompt prefix tokens. + # The drop index differs between edit mode (64) and generate mode (34). 
+ drop_idx = _EDIT_DROP_IDX if images else _GENERATE_DROP_IDX + attn_mask = model_inputs.attention_mask bool_mask = attn_mask.bool() valid_lengths = bool_mask.sum(dim=1) @@ -196,7 +217,7 @@ def _encode( split_hidden = torch.split(selected, valid_lengths.tolist(), dim=0) # Drop system prefix tokens and build padded output - trimmed = [h[_DROP_IDX:] for h in split_hidden] + trimmed = [h[drop_idx:] for h in split_hidden] attn_mask_list = [torch.ones(h.size(0), dtype=torch.long, device=device) for h in trimmed] max_seq_len = max(h.size(0) for h in trimmed) diff --git a/invokeai/backend/model_manager/starter_models.py b/invokeai/backend/model_manager/starter_models.py index de5f1e1b8b6..d049a52eee7 100644 --- a/invokeai/backend/model_manager/starter_models.py +++ b/invokeai/backend/model_manager/starter_models.py @@ -711,6 +711,69 @@ class StarterModelBundle(BaseModel): "Settings: Steps=8, CFG=1, Shift Override=3.", type=ModelType.LoRA, ) + +# Qwen Image (txt2img) +qwen_image = StarterModel( + name="Qwen Image 2512", + base=BaseModelType.QwenImage, + source="Qwen/Qwen-Image-2512", + description="Qwen Image 2512 full diffusers model. High-quality text-to-image generation. (~40GB)", + type=ModelType.Main, +) + +qwen_image_gguf_q4_k_m = StarterModel( + name="Qwen Image 2512 (Q4_K_M)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q4_K_M.gguf", + description="Qwen Image 2512 - Q4_K_M quantized transformer. Good quality/size balance. (~13GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_gguf_q2_k = StarterModel( + name="Qwen Image 2512 (Q2_K)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q2_K.gguf", + description="Qwen Image 2512 - Q2_K heavily quantized transformer. Smallest size, lower quality. 
(~7.5GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_gguf_q6_k = StarterModel( + name="Qwen Image 2512 (Q6_K)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q6_K.gguf", + description="Qwen Image 2512 - Q6_K quantized transformer. Near-lossless quality. (~17GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_gguf_q8_0 = StarterModel( + name="Qwen Image 2512 (Q8_0)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q8_0.gguf", + description="Qwen Image 2512 - Q8_0 quantized transformer. Highest quality quantization. (~22GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_lightning_4step = StarterModel( + name="Qwen Image Lightning (4-step, V2.0, bf16)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-4steps-V2.0-bf16.safetensors", + description="Lightning distillation LoRA for Qwen Image — enables generation in just 4 steps. " + "Settings: Steps=4, CFG=1, Shift Override=3.", + type=ModelType.LoRA, +) + +qwen_image_lightning_8step = StarterModel( + name="Qwen Image Lightning (8-step, V2.0, bf16)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-8steps-V2.0-bf16.safetensors", + description="Lightning distillation LoRA for Qwen Image — enables generation in 8 steps with better quality. 
" + "Settings: Steps=8, CFG=1, Shift Override=3.", + type=ModelType.LoRA, +) # endregion # region SigLIP @@ -1102,6 +1165,10 @@ class StarterModelBundle(BaseModel): qwen_image_gguf_q8_0, qwen_image_lightning_4step, qwen_image_lightning_8step, + qwen_image, + qwen_image_gguf_q4_k_m, + qwen_image_lightning_4step, + qwen_image_lightning_8step, ] STARTER_BUNDLES: dict[str, StarterModelBundle] = { From 8b9e36f05aad8035f0a6c52f146ec37219d97dc7 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Fri, 27 Mar 2026 22:57:03 -0400 Subject: [PATCH 02/18] chore: ruff & lint:prettier --- .../invocations/qwen_image_text_encoder.py | 2 +- .../model_records/model_records_base.py | 10 +++++-- invokeai/backend/model_manager/taxonomy.py | 26 ++++++++++++++++--- .../controlLayers/hooks/addLayerHooks.ts | 6 +---- .../Advanced/ParamQwenImageQuantization.tsx | 5 +--- 5 files changed, 34 insertions(+), 15 deletions(-) diff --git a/invokeai/app/invocations/qwen_image_text_encoder.py b/invokeai/app/invocations/qwen_image_text_encoder.py index 74670735877..9e3f5723ba5 100644 --- a/invokeai/app/invocations/qwen_image_text_encoder.py +++ b/invokeai/app/invocations/qwen_image_text_encoder.py @@ -58,7 +58,7 @@ def _build_prompt(user_prompt: str, num_images: int) -> str: f"<|im_start|>system\n{_GENERATE_SYSTEM_PROMPT}<|im_end|>\n" f"<|im_start|>user\n{user_prompt}<|im_end|>\n" "<|im_start|>assistant\n" - ) + ) @invocation( diff --git a/invokeai/app/services/model_records/model_records_base.py b/invokeai/app/services/model_records/model_records_base.py index ea5b9ef7546..dcdc0ce5956 100644 --- a/invokeai/app/services/model_records/model_records_base.py +++ b/invokeai/app/services/model_records/model_records_base.py @@ -25,8 +25,8 @@ ModelSourceType, ModelType, ModelVariantType, - QwenImageVariantType, Qwen3VariantType, + QwenImageVariantType, SchedulerPredictionType, ZImageVariantType, ) @@ -95,7 +95,13 @@ class ModelRecordChanges(BaseModelExcludeNull): # Checkpoint-specific changes # TODO(MM2): 
Should we expose these? Feels footgun-y... variant: Optional[ - ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | ZImageVariantType | QwenImageVariantType | Qwen3VariantType + ModelVariantType + | ClipVariantType + | FluxVariantType + | Flux2VariantType + | ZImageVariantType + | QwenImageVariantType + | Qwen3VariantType ] = Field(description="The variant of the model.", default=None) prediction_type: Optional[SchedulerPredictionType] = Field( description="The prediction type of the model.", default=None diff --git a/invokeai/backend/model_manager/taxonomy.py b/invokeai/backend/model_manager/taxonomy.py index 9250310a29a..587c0b0625f 100644 --- a/invokeai/backend/model_manager/taxonomy.py +++ b/invokeai/backend/model_manager/taxonomy.py @@ -225,8 +225,28 @@ class FluxLoRAFormat(str, Enum): AnyVariant: TypeAlias = Union[ - ModelVariantType, ClipVariantType, FluxVariantType, Flux2VariantType, ZImageVariantType, QwenImageVariantType, Qwen3VariantType + ModelVariantType, + ClipVariantType, + FluxVariantType, + Flux2VariantType, + ZImageVariantType, + QwenImageVariantType, + Qwen3VariantType, ] variant_type_adapter = TypeAdapter[ - ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | ZImageVariantType | QwenImageVariantType | Qwen3VariantType -](ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | ZImageVariantType | QwenImageVariantType | Qwen3VariantType) + ModelVariantType + | ClipVariantType + | FluxVariantType + | Flux2VariantType + | ZImageVariantType + | QwenImageVariantType + | Qwen3VariantType +]( + ModelVariantType + | ClipVariantType + | FluxVariantType + | Flux2VariantType + | ZImageVariantType + | QwenImageVariantType + | Qwen3VariantType +) diff --git a/invokeai/frontend/web/src/features/controlLayers/hooks/addLayerHooks.ts b/invokeai/frontend/web/src/features/controlLayers/hooks/addLayerHooks.ts index 3cd28b5f2a0..2027ff41741 100644 --- 
a/invokeai/frontend/web/src/features/controlLayers/hooks/addLayerHooks.ts +++ b/invokeai/frontend/web/src/features/controlLayers/hooks/addLayerHooks.ts @@ -80,11 +80,7 @@ export const selectDefaultControlAdapter = createSelector( export const getDefaultRefImageConfig = ( getState: AppGetState -): - | IPAdapterConfig - | FluxKontextReferenceImageConfig - | Flux2ReferenceImageConfig - | QwenImageReferenceImageConfig => { +): IPAdapterConfig | FluxKontextReferenceImageConfig | Flux2ReferenceImageConfig | QwenImageReferenceImageConfig => { const state = getState(); const mainModelConfig = selectMainModelConfig(state); diff --git a/invokeai/frontend/web/src/features/parameters/components/Advanced/ParamQwenImageQuantization.tsx b/invokeai/frontend/web/src/features/parameters/components/Advanced/ParamQwenImageQuantization.tsx index 46025d95867..3d086e6ec4a 100644 --- a/invokeai/frontend/web/src/features/parameters/components/Advanced/ParamQwenImageQuantization.tsx +++ b/invokeai/frontend/web/src/features/parameters/components/Advanced/ParamQwenImageQuantization.tsx @@ -1,10 +1,7 @@ import type { ComboboxOnChange, ComboboxOption } from '@invoke-ai/ui-library'; import { Combobox, FormControl, FormLabel } from '@invoke-ai/ui-library'; import { useAppDispatch, useAppSelector } from 'app/store/storeHooks'; -import { - qwenImageQuantizationChanged, - selectQwenImageQuantization, -} from 'features/controlLayers/store/paramsSlice'; +import { qwenImageQuantizationChanged, selectQwenImageQuantization } from 'features/controlLayers/store/paramsSlice'; import { memo, useCallback, useMemo } from 'react'; import { useTranslation } from 'react-i18next'; From 25b45ca7582a2ac80c90709a6ead381d86ffe125 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Fri, 27 Mar 2026 23:07:53 -0400 Subject: [PATCH 03/18] fix: remove unused frontend exports (zQwenImageVariantType, isQwenImageEditMainModelConfig) Co-Authored-By: Claude Opus 4.6 (1M context) --- 
invokeai/frontend/web/src/features/nodes/types/common.ts | 2 +- invokeai/frontend/web/src/services/api/types.ts | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/invokeai/frontend/web/src/features/nodes/types/common.ts b/invokeai/frontend/web/src/features/nodes/types/common.ts index ca1d42c5a44..10afd6e44bb 100644 --- a/invokeai/frontend/web/src/features/nodes/types/common.ts +++ b/invokeai/frontend/web/src/features/nodes/types/common.ts @@ -153,7 +153,7 @@ export const zModelVariantType = z.enum(['normal', 'inpaint', 'depth']); export const zFluxVariantType = z.enum(['dev', 'dev_fill', 'schnell']); export const zFlux2VariantType = z.enum(['klein_4b', 'klein_9b', 'klein_9b_base']); export const zZImageVariantType = z.enum(['turbo', 'zbase']); -export const zQwenImageVariantType = z.enum(['generate', 'edit']); +const zQwenImageVariantType = z.enum(['generate', 'edit']); export const zQwen3VariantType = z.enum(['qwen3_4b', 'qwen3_8b']); export const zAnyModelVariant = z.union([ zModelVariantType, diff --git a/invokeai/frontend/web/src/services/api/types.ts b/invokeai/frontend/web/src/services/api/types.ts index cfeb672d95e..c8aeda6c760 100644 --- a/invokeai/frontend/web/src/services/api/types.ts +++ b/invokeai/frontend/web/src/services/api/types.ts @@ -330,9 +330,6 @@ export const isQwenImageDiffusersMainModelConfig = (config: AnyModelConfig): con return config.type === 'main' && config.base === 'qwen-image' && config.format === 'diffusers'; }; -export const isQwenImageEditMainModelConfig = (config: AnyModelConfig): config is MainModelConfig => { - return config.type === 'main' && config.base === 'qwen-image' && 'variant' in config && config.variant === 'edit'; -}; export const isTIModelConfig = (config: AnyModelConfig): config is MainModelConfig => { return config.type === 'embedding'; From 66e9f873c5cc9d7836ec48ca741ea540b658ee74 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Fri, 27 Mar 2026 23:17:27 -0400 Subject: [PATCH 04/18] fix: make 
QwenImage variant optional to fix model detection tags The variant field with a default value was appended to the discriminator tag (e.g. main.gguf_quantized.qwen-image.generate), breaking model detection for GGUF and Diffusers models. Making variant optional with default=None restores the correct tags (main.gguf_quantized.qwen-image). The variant is still set during Diffusers model probing via _get_qwen_image_variant() and can be manually set for GGUF models. Co-Authored-By: Claude Opus 4.6 (1M context) --- invokeai/backend/model_manager/configs/main.py | 4 ++-- .../src/features/parameters/components/Prompts/Prompts.tsx | 7 +++++-- invokeai/frontend/web/src/services/api/schema.ts | 6 ++---- invokeai/frontend/web/src/services/api/types.ts | 1 - 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/invokeai/backend/model_manager/configs/main.py b/invokeai/backend/model_manager/configs/main.py index 484a95f4bb8..6ec0611fdf3 100644 --- a/invokeai/backend/model_manager/configs/main.py +++ b/invokeai/backend/model_manager/configs/main.py @@ -1208,7 +1208,7 @@ class Main_Diffusers_QwenImage_Config(Diffusers_Config_Base, Main_Config_Base, C """Model config for Qwen Image diffusers models (both txt2img and edit).""" base: Literal[BaseModelType.QwenImage] = Field(BaseModelType.QwenImage) - variant: QwenImageVariantType = Field(default=QwenImageVariantType.Generate) + variant: QwenImageVariantType | None = Field(default=None) @classmethod def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self: @@ -1269,7 +1269,7 @@ class Main_GGUF_QwenImage_Config(Checkpoint_Config_Base, Main_Config_Base, Confi base: Literal[BaseModelType.QwenImage] = Field(default=BaseModelType.QwenImage) format: Literal[ModelFormat.GGUFQuantized] = Field(default=ModelFormat.GGUFQuantized) - variant: QwenImageVariantType = Field(default=QwenImageVariantType.Generate) + variant: QwenImageVariantType | None = Field(default=None) @classmethod def from_model_on_disk(cls, 
mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self: diff --git a/invokeai/frontend/web/src/features/parameters/components/Prompts/Prompts.tsx b/invokeai/frontend/web/src/features/parameters/components/Prompts/Prompts.tsx index 18f5c4c4dd8..c93841d77b7 100644 --- a/invokeai/frontend/web/src/features/parameters/components/Prompts/Prompts.tsx +++ b/invokeai/frontend/web/src/features/parameters/components/Prompts/Prompts.tsx @@ -22,8 +22,11 @@ export const Prompts = memo(() => { if (!modelSupportsRefImages) { return false; } - if (modelConfig?.base === 'qwen-image' && 'variant' in modelConfig && modelConfig.variant !== 'edit') { - return false; + if (modelConfig?.base === 'qwen-image') { + const variant = 'variant' in modelConfig ? modelConfig.variant : null; + if (variant !== 'edit') { + return false; + } } return true; }, [modelSupportsRefImages, modelConfig]); diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts index a23217c3a81..2a8a3d243b7 100644 --- a/invokeai/frontend/web/src/services/api/schema.ts +++ b/invokeai/frontend/web/src/services/api/schema.ts @@ -18500,8 +18500,7 @@ export type components = { * @constant */ base: "qwen-image"; - /** @default generate */ - variant: components["schemas"]["QwenImageVariantType"]; + variant: components["schemas"]["QwenImageVariantType"] | null; }; /** Main_Diffusers_SD1_Config */ Main_Diffusers_SD1_Config: { @@ -19234,8 +19233,7 @@ export type components = { * @constant */ format: "gguf_quantized"; - /** @default generate */ - variant: components["schemas"]["QwenImageVariantType"]; + variant: components["schemas"]["QwenImageVariantType"] | null; }; /** * Main_GGUF_ZImage_Config diff --git a/invokeai/frontend/web/src/services/api/types.ts b/invokeai/frontend/web/src/services/api/types.ts index c8aeda6c760..b447f9debbe 100644 --- a/invokeai/frontend/web/src/services/api/types.ts +++ b/invokeai/frontend/web/src/services/api/types.ts @@ -330,7 +330,6 @@ 
export const isQwenImageDiffusersMainModelConfig = (config: AnyModelConfig): con return config.type === 'main' && config.base === 'qwen-image' && config.format === 'diffusers'; }; - export const isTIModelConfig = (config: AnyModelConfig): config is MainModelConfig => { return config.type === 'embedding'; }; From 556db02c45e12ff5f14a58f34673aeedea45bddd Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Fri, 27 Mar 2026 23:33:18 -0400 Subject: [PATCH 05/18] fix: restore Qwen Image Edit starter models with distinct variable names The rename from qwen_image_edit -> qwen_image caused variable name collisions with the txt2img starter models. Give edit models the qwen_image_edit_* prefix to distinguish from qwen_image_* (txt2img). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/model_manager/starter_models.py | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/invokeai/backend/model_manager/starter_models.py b/invokeai/backend/model_manager/starter_models.py index d049a52eee7..ca0076cbadd 100644 --- a/invokeai/backend/model_manager/starter_models.py +++ b/invokeai/backend/model_manager/starter_models.py @@ -650,7 +650,7 @@ class StarterModelBundle(BaseModel): # endregion # region Qwen Image Edit -qwen_image = StarterModel( +qwen_image_edit = StarterModel( name="Qwen Image Edit 2511", base=BaseModelType.QwenImage, source="Qwen/Qwen-Image-Edit-2511", @@ -658,7 +658,7 @@ class StarterModelBundle(BaseModel): type=ModelType.Main, ) -qwen_image_gguf_q4_k_m = StarterModel( +qwen_image_edit_gguf_q4_k_m = StarterModel( name="Qwen Image Edit 2511 (Q4_K_M)", base=BaseModelType.QwenImage, source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q4_K_M.gguf", @@ -667,7 +667,7 @@ class StarterModelBundle(BaseModel): format=ModelFormat.GGUFQuantized, ) -qwen_image_gguf_q2_k = StarterModel( +qwen_image_edit_gguf_q2_k = StarterModel( name="Qwen Image Edit 2511 (Q2_K)", base=BaseModelType.QwenImage, 
source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q2_K.gguf", @@ -676,7 +676,7 @@ class StarterModelBundle(BaseModel): format=ModelFormat.GGUFQuantized, ) -qwen_image_gguf_q6_k = StarterModel( +qwen_image_edit_gguf_q6_k = StarterModel( name="Qwen Image Edit 2511 (Q6_K)", base=BaseModelType.QwenImage, source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q6_K.gguf", @@ -685,7 +685,7 @@ class StarterModelBundle(BaseModel): format=ModelFormat.GGUFQuantized, ) -qwen_image_gguf_q8_0 = StarterModel( +qwen_image_edit_gguf_q8_0 = StarterModel( name="Qwen Image Edit 2511 (Q8_0)", base=BaseModelType.QwenImage, source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q8_0.gguf", @@ -694,7 +694,7 @@ class StarterModelBundle(BaseModel): format=ModelFormat.GGUFQuantized, ) -qwen_image_lightning_4step = StarterModel( +qwen_image_edit_lightning_4step = StarterModel( name="Qwen Image Edit Lightning (4-step, bf16)", base=BaseModelType.QwenImage, source="https://huggingface.co/lightx2v/Qwen-Image-Edit-2511-Lightning/resolve/main/Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors", @@ -703,7 +703,7 @@ class StarterModelBundle(BaseModel): type=ModelType.LoRA, ) -qwen_image_lightning_8step = StarterModel( +qwen_image_edit_lightning_8step = StarterModel( name="Qwen Image Edit Lightning (8-step, bf16)", base=BaseModelType.QwenImage, source="https://huggingface.co/lightx2v/Qwen-Image-Edit-2511-Lightning/resolve/main/Qwen-Image-Edit-2511-Lightning-8steps-V1.0-bf16.safetensors", @@ -1075,6 +1075,13 @@ class StarterModelBundle(BaseModel): flux2_klein_qwen3_4b_encoder, flux2_klein_qwen3_8b_encoder, cogview4, + qwen_image_edit, + qwen_image_edit_gguf_q2_k, + qwen_image_edit_gguf_q4_k_m, + qwen_image_edit_gguf_q6_k, + qwen_image_edit_gguf_q8_0, + qwen_image_edit_lightning_4step, + qwen_image_edit_lightning_8step, qwen_image, qwen_image_gguf_q2_k, 
qwen_image_gguf_q4_k_m, @@ -1160,11 +1167,11 @@ class StarterModelBundle(BaseModel): ] qwen_image_bundle: list[StarterModel] = [ - qwen_image, - qwen_image_gguf_q4_k_m, - qwen_image_gguf_q8_0, - qwen_image_lightning_4step, - qwen_image_lightning_8step, + qwen_image_edit, + qwen_image_edit_gguf_q4_k_m, + qwen_image_edit_gguf_q8_0, + qwen_image_edit_lightning_4step, + qwen_image_edit_lightning_8step, qwen_image, qwen_image_gguf_q4_k_m, qwen_image_lightning_4step, From f3dfbd5d4473c54583cc29a53d73f67b0b39f995 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Fri, 27 Mar 2026 23:53:09 -0400 Subject: [PATCH 06/18] fix: restore correct GGUF filenames in Qwen Image Edit starter model URLs The global rename sed changed 'qwen-image-edit-2511' to 'qwen-image-2511' inside the HuggingFace URLs, but the actual files on HF still have 'edit' in their names. Co-Authored-By: Claude Opus 4.6 (1M context) --- invokeai/backend/model_manager/starter_models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/invokeai/backend/model_manager/starter_models.py b/invokeai/backend/model_manager/starter_models.py index ca0076cbadd..ef7b25431a0 100644 --- a/invokeai/backend/model_manager/starter_models.py +++ b/invokeai/backend/model_manager/starter_models.py @@ -661,7 +661,7 @@ class StarterModelBundle(BaseModel): qwen_image_edit_gguf_q4_k_m = StarterModel( name="Qwen Image Edit 2511 (Q4_K_M)", base=BaseModelType.QwenImage, - source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q4_K_M.gguf", + source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q4_K_M.gguf", description="Qwen Image Edit 2511 - Q4_K_M quantized transformer. Good quality/size balance. 
(~13GB)", type=ModelType.Main, format=ModelFormat.GGUFQuantized, @@ -670,7 +670,7 @@ class StarterModelBundle(BaseModel): qwen_image_edit_gguf_q2_k = StarterModel( name="Qwen Image Edit 2511 (Q2_K)", base=BaseModelType.QwenImage, - source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q2_K.gguf", + source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q2_K.gguf", description="Qwen Image Edit 2511 - Q2_K heavily quantized transformer. Smallest size, lower quality. (~7.5GB)", type=ModelType.Main, format=ModelFormat.GGUFQuantized, @@ -679,7 +679,7 @@ class StarterModelBundle(BaseModel): qwen_image_edit_gguf_q6_k = StarterModel( name="Qwen Image Edit 2511 (Q6_K)", base=BaseModelType.QwenImage, - source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q6_K.gguf", + source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q6_K.gguf", description="Qwen Image Edit 2511 - Q6_K quantized transformer. Near-lossless quality. (~17GB)", type=ModelType.Main, format=ModelFormat.GGUFQuantized, @@ -688,7 +688,7 @@ class StarterModelBundle(BaseModel): qwen_image_edit_gguf_q8_0 = StarterModel( name="Qwen Image Edit 2511 (Q8_0)", base=BaseModelType.QwenImage, - source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q8_0.gguf", + source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q8_0.gguf", description="Qwen Image Edit 2511 - Q8_0 quantized transformer. Highest quality quantization. 
(~22GB)", type=ModelType.Main, format=ModelFormat.GGUFQuantized, From 6a19ad57f3a49c4c26328fcca6b9d75f292c01e4 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sat, 28 Mar 2026 00:36:28 -0400 Subject: [PATCH 07/18] fix: skip reference images in graph for non-edit Qwen Image models When switching from an edit model to a generate model, reference images remain in state but the panel is hidden. Prevent them from being passed to the text encoder and VAE encoder by checking the model variant. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../graph/generation/buildQwenImageGraph.ts | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts index e7c04744d4e..28a9b253485 100644 --- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts @@ -103,14 +103,18 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise - entity.isEnabled && - isQwenImageReferenceImageConfig(entity.config) && - entity.config.image !== null && - getGlobalReferenceImageWarnings(entity, model).length === 0 - ); + // Only collect reference images for edit-variant models. + // For txt2img (generate) models, reference images are not used even if they exist in state. + const isEditModel = 'variant' in model && model.variant === 'edit'; + const validRefImageConfigs = isEditModel + ? 
selectRefImagesSlice(state).entities.filter( + (entity) => + entity.isEnabled && + isQwenImageReferenceImageConfig(entity.config) && + entity.config.image !== null && + getGlobalReferenceImageWarnings(entity, model).length === 0 + ) + : []; if (validRefImageConfigs.length > 0) { const refImgCollect = g.addNode({ From 058df877c039b4ef09d9e96e7aafb2c58aae9576 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sat, 28 Mar 2026 00:41:10 -0400 Subject: [PATCH 08/18] fix: only set zero_cond_t=True for edit-variant GGUF models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The txt2img model doesn't use zero_cond_t — setting it causes the transformer to double the timestep batch and create modulation indices for non-existent reference patches, producing noise output. Now checks the config variant before enabling it. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../model_manager/load/model_loaders/qwen_image.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/invokeai/backend/model_manager/load/model_loaders/qwen_image.py b/invokeai/backend/model_manager/load/model_loaders/qwen_image.py index 15fcedba166..a025e727945 100644 --- a/invokeai/backend/model_manager/load/model_loaders/qwen_image.py +++ b/invokeai/backend/model_manager/load/model_loaders/qwen_image.py @@ -15,6 +15,7 @@ BaseModelType, ModelFormat, ModelType, + QwenImageVariantType, SubModelType, ) from invokeai.backend.quantization.gguf.ggml_tensor import GGMLTensor @@ -160,10 +161,13 @@ def _load_from_singlefile(self, config: AnyModelConfig) -> AnyModel: "axes_dims_rope": (16, 56, 56), } - # zero_cond_t was added in diffusers 0.37+; skip it on older versions + # zero_cond_t is only used by edit-variant models. It enables dual modulation + # for noisy vs reference patches. Setting it on txt2img models produces garbage. + # Also requires diffusers 0.37+ (the parameter doesn't exist in older versions). 
import inspect - if "zero_cond_t" in inspect.signature(QwenImageTransformer2DModel.__init__).parameters: + is_edit = getattr(config, "variant", None) == QwenImageVariantType.Edit + if is_edit and "zero_cond_t" in inspect.signature(QwenImageTransformer2DModel.__init__).parameters: model_config["zero_cond_t"] = True with accelerate.init_empty_weights(): From b41bee72bcb6836b01a50854de8e3fa437d9e39e Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sat, 28 Mar 2026 01:04:28 -0400 Subject: [PATCH 09/18] fix: recall Qwen Image advanced params (component source, quantization, shift) - Save qwen_image_component_source, qwen_image_quantization, and qwen_image_shift in generation metadata - Add metadata recall handlers so remix/recall restores these settings Co-Authored-By: Claude Opus 4.6 (1M context) --- .../web/src/features/metadata/parsing.tsx | 83 +++++++++++++++++++ .../graph/generation/buildQwenImageGraph.ts | 3 + 2 files changed, 86 insertions(+) diff --git a/invokeai/frontend/web/src/features/metadata/parsing.tsx b/invokeai/frontend/web/src/features/metadata/parsing.tsx index 7d1d511a3c2..58f3aaab07d 100644 --- a/invokeai/frontend/web/src/features/metadata/parsing.tsx +++ b/invokeai/frontend/web/src/features/metadata/parsing.tsx @@ -39,6 +39,9 @@ import { setZImageSeedVarianceEnabled, setZImageSeedVarianceRandomizePercent, setZImageSeedVarianceStrength, + qwenImageComponentSourceSelected, + qwenImageQuantizationChanged, + qwenImageShiftChanged, vaeSelected, widthChanged, zImageQwen3EncoderModelSelected, @@ -677,6 +680,83 @@ const ZImageSeedVarianceRandomizePercent: SingleMetadataHandler = { }; //#endregion ZImageSeedVarianceRandomizePercent +//#region QwenImageComponentSource +const QwenImageComponentSource: SingleMetadataHandler = { + [SingleMetadataKey]: true, + type: 'QwenImageComponentSource', + parse: async (metadata, _store) => { + try { + const raw = getProperty(metadata, 'qwen_image_component_source'); + if (raw === null || raw === undefined) { + return 
Promise.resolve(null); + } + return Promise.resolve(zModelIdentifierField.parse(raw)); + } catch { + return Promise.resolve(null); + } + }, + recall: (value, store) => { + store.dispatch(qwenImageComponentSourceSelected(value)); + }, + i18nKey: 'modelManager.qwenImageComponentSource', + LabelComponent: MetadataLabel, + ValueComponent: ({ value }: SingleMetadataValueProps) => ( + + ), +}; +//#endregion QwenImageComponentSource + +//#region QwenImageQuantization +const QwenImageQuantization: SingleMetadataHandler<'none' | 'int8' | 'nf4'> = { + [SingleMetadataKey]: true, + type: 'QwenImageQuantization', + parse: (metadata, _store) => { + try { + const raw = getProperty(metadata, 'qwen_image_quantization'); + const parsed = z.enum(['none', 'int8', 'nf4']).parse(raw); + return Promise.resolve(parsed); + } catch { + return Promise.resolve('none' as const); + } + }, + recall: (value, store) => { + store.dispatch(qwenImageQuantizationChanged(value)); + }, + i18nKey: 'modelManager.qwenImageQuantization', + LabelComponent: MetadataLabel, + ValueComponent: ({ value }: SingleMetadataValueProps<'none' | 'int8' | 'nf4'>) => ( + + ), +}; +//#endregion QwenImageQuantization + +//#region QwenImageShift +const QwenImageShift: SingleMetadataHandler = { + [SingleMetadataKey]: true, + type: 'QwenImageShift', + parse: (metadata, _store) => { + try { + const raw = getProperty(metadata, 'qwen_image_shift'); + if (raw === null || raw === undefined) { + return Promise.resolve(null); + } + const parsed = z.number().parse(raw); + return Promise.resolve(parsed); + } catch { + return Promise.resolve(null); + } + }, + recall: (value, store) => { + store.dispatch(qwenImageShiftChanged(value)); + }, + i18nKey: 'modelManager.qwenImageShift', + LabelComponent: MetadataLabel, + ValueComponent: ({ value }: SingleMetadataValueProps) => ( + + ), +}; +//#endregion QwenImageShift + //#region RefinerModel const RefinerModel: SingleMetadataHandler = { [SingleMetadataKey]: true, @@ -1233,6 +1313,9 @@ export 
const ImageMetadataHandlers = { ZImageSeedVarianceEnabled, ZImageSeedVarianceStrength, ZImageSeedVarianceRandomizePercent, + QwenImageComponentSource, + QwenImageQuantization, + QwenImageShift, LoRAs, CanvasLayers, RefImages, diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts index 28a9b253485..8f1cb5362cd 100644 --- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts @@ -167,6 +167,9 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise Date: Sat, 28 Mar 2026 01:12:12 -0400 Subject: [PATCH 10/18] fix: remove unnecessary async from QwenImageComponentSource parse Co-Authored-By: Claude Opus 4.6 (1M context) --- invokeai/frontend/web/src/features/metadata/parsing.tsx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/invokeai/frontend/web/src/features/metadata/parsing.tsx b/invokeai/frontend/web/src/features/metadata/parsing.tsx index 58f3aaab07d..4f179d6b017 100644 --- a/invokeai/frontend/web/src/features/metadata/parsing.tsx +++ b/invokeai/frontend/web/src/features/metadata/parsing.tsx @@ -13,6 +13,9 @@ import { kleinVaeModelSelected, negativePromptChanged, positivePromptChanged, + qwenImageComponentSourceSelected, + qwenImageQuantizationChanged, + qwenImageShiftChanged, refinerModelChanged, selectBase, setCfgRescaleMultiplier, @@ -39,9 +42,6 @@ import { setZImageSeedVarianceEnabled, setZImageSeedVarianceRandomizePercent, setZImageSeedVarianceStrength, - qwenImageComponentSourceSelected, - qwenImageQuantizationChanged, - qwenImageShiftChanged, vaeSelected, widthChanged, zImageQwen3EncoderModelSelected, @@ -684,7 +684,7 @@ const ZImageSeedVarianceRandomizePercent: SingleMetadataHandler = { const QwenImageComponentSource: SingleMetadataHandler = { 
[SingleMetadataKey]: true, type: 'QwenImageComponentSource', - parse: async (metadata, _store) => { + parse: (metadata, _store) => { try { const raw = getProperty(metadata, 'qwen_image_component_source'); if (raw === null || raw === undefined) { From 2aeb2fdd3a57ecc3d8003b6a761983aaa7e3c481 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sat, 28 Mar 2026 01:30:52 -0400 Subject: [PATCH 11/18] fix: prevent Flux LoRAs from being detected as Qwen Image LoRAs Flux PEFT LoRAs use transformer.single_transformer_blocks.* keys which contain "transformer_blocks." as a substring, falsely matching the Qwen Image LoRA detection. Add single_transformer_blocks to the Flux exclusion set. Co-Authored-By: Claude Opus 4.6 (1M context) --- invokeai/backend/model_manager/configs/lora.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/invokeai/backend/model_manager/configs/lora.py b/invokeai/backend/model_manager/configs/lora.py index a5b9f40631d..f305bbddee8 100644 --- a/invokeai/backend/model_manager/configs/lora.py +++ b/invokeai/backend/model_manager/configs/lora.py @@ -775,14 +775,19 @@ def _validate_looks_like_lora(cls, mod: ModelOnDisk) -> None: state_dict, {"lora_A.weight", "lora_B.weight", "lora_down.weight", "lora_up.weight", "dora_scale"}, ) - # Must NOT have diffusion_model.layers (Z-Image) or double_blocks/single_blocks (Flux) + # Must NOT have diffusion_model.layers (Z-Image) or Flux-style keys. + # Flux LoRAs can have transformer.single_transformer_blocks or transformer.transformer_blocks + # (with the "transformer." prefix and "single_" variant) which would falsely match our check. 
has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."}) - has_flux_keys = state_dict_has_any_keys_starting_with(state_dict, {"double_blocks.", "single_blocks."}) + has_flux_keys = state_dict_has_any_keys_starting_with( + state_dict, + {"double_blocks.", "single_blocks.", "single_transformer_blocks.", "transformer.single_transformer_blocks."}, + ) if has_qwen_ie_keys and has_lora_suffix and not has_z_image_keys and not has_flux_keys: return - raise NotAMatchError("model does not match Qwen Image Edit LoRA heuristics") + raise NotAMatchError("model does not match Qwen Image LoRA heuristics") @classmethod def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType: @@ -791,7 +796,10 @@ def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType: state_dict, {"transformer_blocks.", "transformer.transformer_blocks."} ) has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."}) - has_flux_keys = state_dict_has_any_keys_starting_with(state_dict, {"double_blocks.", "single_blocks."}) + has_flux_keys = state_dict_has_any_keys_starting_with( + state_dict, + {"double_blocks.", "single_blocks.", "single_transformer_blocks.", "transformer.single_transformer_blocks."}, + ) if has_qwen_ie_keys and not has_z_image_keys and not has_flux_keys: return BaseModelType.QwenImage From 5c6ca302b037edc0f0947aa5c0228141a80a1ed3 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sat, 28 Mar 2026 01:41:33 -0400 Subject: [PATCH 12/18] chore: ruff --- invokeai/backend/model_manager/configs/lora.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/invokeai/backend/model_manager/configs/lora.py b/invokeai/backend/model_manager/configs/lora.py index f305bbddee8..f2e6f3b34fa 100644 --- a/invokeai/backend/model_manager/configs/lora.py +++ b/invokeai/backend/model_manager/configs/lora.py @@ -781,7 +781,12 @@ def _validate_looks_like_lora(cls, mod: ModelOnDisk) -> None: 
has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."}) has_flux_keys = state_dict_has_any_keys_starting_with( state_dict, - {"double_blocks.", "single_blocks.", "single_transformer_blocks.", "transformer.single_transformer_blocks."}, + { + "double_blocks.", + "single_blocks.", + "single_transformer_blocks.", + "transformer.single_transformer_blocks.", + }, ) if has_qwen_ie_keys and has_lora_suffix and not has_z_image_keys and not has_flux_keys: @@ -798,7 +803,12 @@ def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType: has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."}) has_flux_keys = state_dict_has_any_keys_starting_with( state_dict, - {"double_blocks.", "single_blocks.", "single_transformer_blocks.", "transformer.single_transformer_blocks."}, + { + "double_blocks.", + "single_blocks.", + "single_transformer_blocks.", + "transformer.single_transformer_blocks.", + }, ) if has_qwen_ie_keys and not has_z_image_keys and not has_flux_keys: From 2fcedc72dfa091ad169e5927ac5d3ac779e4a96b Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sat, 28 Mar 2026 10:15:08 -0400 Subject: [PATCH 13/18] fix: don't force reference image to output aspect ratio in VAE encoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the graph builder passed the output canvas dimensions to the I2L node, which resized the reference image to match — distorting its aspect ratio when they differed. Now the reference is encoded at its native size. The denoise node already handles dimension mismatches via bilinear interpolation in latent space. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../util/graph/generation/buildQwenImageGraph.ts | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts index 8f1cb5362cd..1ea20a377e6 100644 --- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts @@ -15,11 +15,7 @@ import { addQwenImageLoRAs } from 'features/nodes/util/graph/generation/addQwenI import { addTextToImage } from 'features/nodes/util/graph/generation/addTextToImage'; import { addWatermarker } from 'features/nodes/util/graph/generation/addWatermarker'; import { Graph } from 'features/nodes/util/graph/generation/Graph'; -import { - getOriginalAndScaledSizesForTextToImage, - selectCanvasOutputFields, - selectPresetModifiedPrompts, -} from 'features/nodes/util/graph/graphBuilderUtils'; +import { selectCanvasOutputFields, selectPresetModifiedPrompts } from 'features/nodes/util/graph/graphBuilderUtils'; import type { GraphBuilderArg, GraphBuilderReturn, ImageOutputNodes } from 'features/nodes/util/graph/types'; import { selectActiveTab } from 'features/ui/store/uiSelectors'; import type { Invocation } from 'services/api/types'; @@ -139,14 +135,12 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise Date: Sat, 28 Mar 2026 22:43:09 -0400 Subject: [PATCH 14/18] fix: clip denoise schedule by denoising_start/end, block GGUF enqueue without component source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses two reviewer findings: 1. denoising_start/denoising_end were ignored — the full sigma schedule was always used regardless of img2img strength. 
Now clip the scheduler's sigmas to the fractional range before stepping, and use manual Euler steps with the clipped schedule (scheduler.step() can't handle clipped schedules due to internal index tracking). 2. GGUF Qwen Image models could be enqueued without a Component Source, deferring the error to runtime. Added readiness checks on both the Generate and Canvas tabs that block enqueue when a GGUF model is selected but no Diffusers component source is configured. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../app/invocations/qwen_image_denoise.py | 25 ++++++++++++++----- invokeai/frontend/web/public/locales/en.json | 1 + .../web/src/features/queue/store/readiness.ts | 18 +++++++++++++ 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/invokeai/app/invocations/qwen_image_denoise.py b/invokeai/app/invocations/qwen_image_denoise.py index 4b9fb207680..7e6e91ad0b1 100644 --- a/invokeai/app/invocations/qwen_image_denoise.py +++ b/invokeai/app/invocations/qwen_image_denoise.py @@ -304,8 +304,19 @@ def _run_diffusion(self, context: InvocationContext): init_sigmas = np.linspace(1.0, 1.0 / self.steps, self.steps).tolist() scheduler.set_timesteps(sigmas=init_sigmas, mu=mu, device=device) - timesteps_sched = scheduler.timesteps - sigmas_sched = scheduler.sigmas + # Clip the schedule based on denoising_start/denoising_end to support img2img strength. + # The scheduler's sigmas go from high (noisy) to 0 (clean). We clip to the fractional range. 
+ sigmas_sched = scheduler.sigmas # (N+1,) including terminal 0 + if self.denoising_start > 0 or self.denoising_end < 1: + total_sigmas = len(sigmas_sched) - 1 # exclude terminal + start_idx = int(round(self.denoising_start * total_sigmas)) + end_idx = int(round(self.denoising_end * total_sigmas)) + sigmas_sched = sigmas_sched[start_idx : end_idx + 1] # +1 to include the next sigma for dt + # Rebuild timesteps from clipped sigmas (exclude terminal 0) + timesteps_sched = sigmas_sched[:-1] * scheduler.config.num_train_timesteps + else: + timesteps_sched = scheduler.timesteps + total_steps = len(timesteps_sched) cfg_scale = self._prepare_cfg_scale(total_steps) @@ -437,8 +448,6 @@ def _run_diffusion(self, context: InvocationContext): ) ) - scheduler.set_begin_index(0) - for step_idx, t in enumerate(tqdm(timesteps_sched)): # The pipeline passes timestep / 1000 to the transformer timestep = t.expand(latents.shape[0]).to(inference_dtype) @@ -476,8 +485,12 @@ def _run_diffusion(self, context: InvocationContext): else: noise_pred = noise_pred_cond - # Use the scheduler's step method — exactly matching the pipeline - latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0] + # Euler step using the (possibly clipped) sigma schedule + sigma_curr = sigmas_sched[step_idx] + sigma_next = sigmas_sched[step_idx + 1] + dt = sigma_next - sigma_curr + latents = latents.to(torch.float32) + dt * noise_pred.to(torch.float32) + latents = latents.to(inference_dtype) if inpaint_extension is not None: sigma_next = sigmas_sched[step_idx + 1].item() diff --git a/invokeai/frontend/web/public/locales/en.json b/invokeai/frontend/web/public/locales/en.json index 408caecc982..e5121b1cfa5 100644 --- a/invokeai/frontend/web/public/locales/en.json +++ b/invokeai/frontend/web/public/locales/en.json @@ -1501,6 +1501,7 @@ "noFLUXVAEModelSelected": "No VAE model selected for FLUX generation", "noCLIPEmbedModelSelected": "No CLIP Embed model selected for FLUX generation", 
"noQwen3EncoderModelSelected": "No Qwen3 Encoder model selected for FLUX2 Klein generation", + "noQwenImageComponentSourceSelected": "GGUF Qwen Image models require a Diffusers Component Source for VAE/encoder", "noZImageVaeSourceSelected": "No VAE source: Select VAE (FLUX) or Qwen3 Source model", "noZImageQwen3EncoderSourceSelected": "No Qwen3 Encoder source: Select Qwen3 Encoder or Qwen3 Source model", "fluxModelIncompatibleBboxWidth": "$t(parameters.invoke.fluxRequiresDimensionsToBeMultipleOf16), bbox width is {{width}}", diff --git a/invokeai/frontend/web/src/features/queue/store/readiness.ts b/invokeai/frontend/web/src/features/queue/store/readiness.ts index 6fc0376208f..60e32154d40 100644 --- a/invokeai/frontend/web/src/features/queue/store/readiness.ts +++ b/invokeai/frontend/web/src/features/queue/store/readiness.ts @@ -257,6 +257,18 @@ const getReasonsWhyCannotEnqueueGenerateTab = (arg: { // FLUX.2 (Klein) extracts Qwen3 encoder and VAE from main model - no separate selections needed + if (model?.base === 'qwen-image' && model.format === 'gguf_quantized') { + if (!params.qwenImageComponentSource) { + reasons.push({ content: i18n.t('parameters.invoke.noQwenImageComponentSourceSelected') }); + } + } + + if (model?.base === 'qwen-image' && model.format === 'gguf_quantized') { + if (!params.qwenImageComponentSource) { + reasons.push({ content: i18n.t('parameters.invoke.noQwenImageComponentSourceSelected') }); + } + } + if (model?.base === 'z-image') { // Check if VAE source is available (either separate VAE or Qwen3 Source) const hasVaeSource = params.zImageVaeModel !== null || params.zImageQwen3SourceModel !== null; @@ -680,6 +692,12 @@ const getReasonsWhyCannotEnqueueCanvasTab = (arg: { } } + if (model?.base === 'qwen-image' && model.format === 'gguf_quantized') { + if (!params.qwenImageComponentSource) { + reasons.push({ content: i18n.t('parameters.invoke.noQwenImageComponentSourceSelected') }); + } + } + if (model?.base === 'z-image') { // Check if VAE 
source is available (either separate VAE or Qwen3 Source) const hasVaeSource = params.zImageVaeModel !== null || params.zImageQwen3SourceModel !== null; From bf9addbf3c803c95d3e4444ce0557ffbdd438cfc Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sun, 29 Mar 2026 22:05:13 -0400 Subject: [PATCH 15/18] refactor: rename Qwen Image Edit node titles/descriptions to Qwen Image All invocation nodes work with both Qwen Image (txt2img) and Qwen Image Edit models. Rename titles and docstrings from "Qwen Image Edit" to "Qwen Image" to avoid confusion. Also remove duplicate GGUF readiness check in the Generate tab. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../app/invocations/qwen_image_denoise.py | 6 ++-- .../qwen_image_image_to_latents.py | 6 ++-- .../qwen_image_latents_to_image.py | 6 ++-- .../app/invocations/qwen_image_lora_loader.py | 10 +++--- .../invocations/qwen_image_model_loader.py | 10 +++--- .../invocations/qwen_image_text_encoder.py | 4 +-- .../web/src/features/queue/store/readiness.ts | 6 ---- .../frontend/web/src/services/api/schema.ts | 34 +++++++++---------- 8 files changed, 38 insertions(+), 44 deletions(-) diff --git a/invokeai/app/invocations/qwen_image_denoise.py b/invokeai/app/invocations/qwen_image_denoise.py index 7e6e91ad0b1..11c0a2e0e8e 100644 --- a/invokeai/app/invocations/qwen_image_denoise.py +++ b/invokeai/app/invocations/qwen_image_denoise.py @@ -36,14 +36,14 @@ @invocation( "qwen_image_denoise", - title="Denoise - Qwen Image Edit", + title="Denoise - Qwen Image", tags=["image", "qwen_image"], category="image", version="1.0.0", classification=Classification.Prototype, ) class QwenImageDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): - """Run the denoising process with a Qwen Image Edit model.""" + """Run the denoising process with a Qwen Image model.""" # If latents is provided, this means we are doing image-to-image. 
latents: Optional[LatentsField] = InputField( @@ -270,7 +270,7 @@ def _run_diffusion(self, context: InvocationContext): # Try to load the scheduler config from the model's directory (Diffusers models # have a scheduler/ subdir). For GGUF models this path doesn't exist, so fall - # back to instantiating the scheduler with the known Qwen Image Edit defaults. + # back to instantiating the scheduler with the known Qwen Image defaults. model_path = context.models.get_absolute_path(context.models.get_config(self.transformer.transformer)) scheduler_path = model_path / "scheduler" if scheduler_path.is_dir() and (scheduler_path / "scheduler_config.json").exists(): diff --git a/invokeai/app/invocations/qwen_image_image_to_latents.py b/invokeai/app/invocations/qwen_image_image_to_latents.py index 19d233a7073..c5fe1b5d5c8 100644 --- a/invokeai/app/invocations/qwen_image_image_to_latents.py +++ b/invokeai/app/invocations/qwen_image_image_to_latents.py @@ -22,14 +22,14 @@ @invocation( "qwen_image_i2l", - title="Image to Latents - Qwen Image Edit", + title="Image to Latents - Qwen Image", tags=["image", "latents", "vae", "i2l", "qwen_image"], category="image", version="1.0.0", classification=Classification.Prototype, ) class QwenImageImageToLatentsInvocation(BaseInvocation, WithMetadata, WithBoard): - """Generates latents from an image using the Qwen Image Edit VAE.""" + """Generates latents from an image using the Qwen Image VAE.""" image: ImageField = InputField(description="The image to encode.") vae: VAEField = InputField(description=FieldDescriptions.vae, input=Input.Connection) @@ -51,7 +51,7 @@ def vae_encode(vae_info: LoadedModel, image_tensor: torch.Tensor) -> torch.Tenso image_tensor = image_tensor.to(device=TorchDevice.choose_torch_device(), dtype=vae.dtype) with torch.inference_mode(): - # The Qwen Image Edit VAE expects 5D input: (B, C, num_frames, H, W) + # The Qwen Image VAE expects 5D input: (B, C, num_frames, H, W) if image_tensor.dim() == 4: image_tensor = 
image_tensor.unsqueeze(2) diff --git a/invokeai/app/invocations/qwen_image_latents_to_image.py b/invokeai/app/invocations/qwen_image_latents_to_image.py index f1bce204806..b3ea39c4bbf 100644 --- a/invokeai/app/invocations/qwen_image_latents_to_image.py +++ b/invokeai/app/invocations/qwen_image_latents_to_image.py @@ -23,14 +23,14 @@ @invocation( "qwen_image_l2i", - title="Latents to Image - Qwen Image Edit", + title="Latents to Image - Qwen Image", tags=["latents", "image", "vae", "l2i", "qwen_image"], category="latents", version="1.0.0", classification=Classification.Prototype, ) class QwenImageLatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard): - """Generates an image from latents using the Qwen Image Edit VAE.""" + """Generates an image from latents using the Qwen Image VAE.""" latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection) vae: VAEField = InputField(description=FieldDescriptions.vae, input=Input.Connection) @@ -56,7 +56,7 @@ def invoke(self, context: InvocationContext) -> ImageOutput: TorchDevice.empty_cache() with torch.inference_mode(), tiling_context: - # The Qwen Image Edit VAE uses per-channel latents_mean / latents_std + # The Qwen Image VAE uses per-channel latents_mean / latents_std # instead of a single scaling_factor. # Latents are 5D: (B, C, num_frames, H, W) — the unpack from the # denoise step already produces this shape. 
diff --git a/invokeai/app/invocations/qwen_image_lora_loader.py b/invokeai/app/invocations/qwen_image_lora_loader.py index fb056166153..f670b2d8954 100644 --- a/invokeai/app/invocations/qwen_image_lora_loader.py +++ b/invokeai/app/invocations/qwen_image_lora_loader.py @@ -15,7 +15,7 @@ @invocation_output("qwen_image_lora_loader_output") class QwenImageLoRALoaderOutput(BaseInvocationOutput): - """Qwen Image Edit LoRA Loader Output""" + """Qwen Image LoRA Loader Output""" transformer: Optional[TransformerField] = OutputField( default=None, description=FieldDescriptions.transformer, title="Transformer" @@ -24,14 +24,14 @@ class QwenImageLoRALoaderOutput(BaseInvocationOutput): @invocation( "qwen_image_lora_loader", - title="Apply LoRA - Qwen Image Edit", + title="Apply LoRA - Qwen Image", tags=["lora", "model", "qwen_image"], category="model", version="1.0.0", classification=Classification.Prototype, ) class QwenImageLoRALoaderInvocation(BaseInvocation): - """Apply a LoRA model to a Qwen Image Edit transformer.""" + """Apply a LoRA model to a Qwen Image transformer.""" lora: ModelIdentifierField = InputField( description=FieldDescriptions.lora_model, @@ -72,14 +72,14 @@ def invoke(self, context: InvocationContext) -> QwenImageLoRALoaderOutput: @invocation( "qwen_image_lora_collection_loader", - title="Apply LoRA Collection - Qwen Image Edit", + title="Apply LoRA Collection - Qwen Image", tags=["lora", "model", "qwen_image"], category="model", version="1.0.0", classification=Classification.Prototype, ) class QwenImageLoRACollectionLoader(BaseInvocation): - """Applies a collection of LoRAs to a Qwen Image Edit transformer.""" + """Applies a collection of LoRAs to a Qwen Image transformer.""" loras: Optional[LoRAField | list[LoRAField]] = InputField( default=None, description="LoRA models and weights. 
May be a single LoRA or collection.", title="LoRAs" diff --git a/invokeai/app/invocations/qwen_image_model_loader.py b/invokeai/app/invocations/qwen_image_model_loader.py index e2d21820b05..fd96067f561 100644 --- a/invokeai/app/invocations/qwen_image_model_loader.py +++ b/invokeai/app/invocations/qwen_image_model_loader.py @@ -20,7 +20,7 @@ @invocation_output("qwen_image_model_loader_output") class QwenImageModelLoaderOutput(BaseInvocationOutput): - """Qwen Image Edit base model loader output.""" + """Qwen Image model loader output.""" transformer: TransformerField = OutputField(description=FieldDescriptions.transformer, title="Transformer") qwen_vl_encoder: QwenVLEncoderField = OutputField( @@ -31,14 +31,14 @@ class QwenImageModelLoaderOutput(BaseInvocationOutput): @invocation( "qwen_image_model_loader", - title="Main Model - Qwen Image Edit", + title="Main Model - Qwen Image", tags=["model", "qwen_image"], category="model", version="1.1.0", classification=Classification.Prototype, ) class QwenImageModelLoaderInvocation(BaseInvocation): - """Loads a Qwen Image Edit model, outputting its submodels. + """Loads a Qwen Image model, outputting its submodels. The transformer is always loaded from the main model (Diffusers or GGUF). @@ -59,7 +59,7 @@ class QwenImageModelLoaderInvocation(BaseInvocation): component_source: Optional[ModelIdentifierField] = InputField( default=None, - description="Diffusers Qwen Image Edit model to extract the VAE and Qwen VL encoder from. " + description="Diffusers Qwen Image model to extract the VAE and Qwen VL encoder from. " "Required when using a GGUF quantized transformer. " "Ignored when the main model is already in Diffusers format.", input=Input.Direct, @@ -96,7 +96,7 @@ def invoke(self, context: InvocationContext) -> QwenImageModelLoaderOutput: raise ValueError( "No source for VAE and Qwen VL encoder. 
" "GGUF quantized models only contain the transformer — " - "please set 'Component Source' to a Diffusers Qwen Image Edit model " + "please set 'Component Source' to a Diffusers Qwen Image model " "to provide the VAE and text encoder." ) diff --git a/invokeai/app/invocations/qwen_image_text_encoder.py b/invokeai/app/invocations/qwen_image_text_encoder.py index 9e3f5723ba5..a067421452e 100644 --- a/invokeai/app/invocations/qwen_image_text_encoder.py +++ b/invokeai/app/invocations/qwen_image_text_encoder.py @@ -63,14 +63,14 @@ def _build_prompt(user_prompt: str, num_images: int) -> str: @invocation( "qwen_image_text_encoder", - title="Prompt - Qwen Image Edit", + title="Prompt - Qwen Image", tags=["prompt", "conditioning", "qwen_image"], category="conditioning", version="1.2.0", classification=Classification.Prototype, ) class QwenImageTextEncoderInvocation(BaseInvocation): - """Encodes text and reference images for Qwen Image Edit using Qwen2.5-VL.""" + """Encodes text and reference images for Qwen Image using Qwen2.5-VL.""" prompt: str = InputField(description="Text prompt describing the desired edit.", ui_component=UIComponent.Textarea) reference_images: list[ImageField] = InputField( diff --git a/invokeai/frontend/web/src/features/queue/store/readiness.ts b/invokeai/frontend/web/src/features/queue/store/readiness.ts index 60e32154d40..3f5a46c6381 100644 --- a/invokeai/frontend/web/src/features/queue/store/readiness.ts +++ b/invokeai/frontend/web/src/features/queue/store/readiness.ts @@ -263,12 +263,6 @@ const getReasonsWhyCannotEnqueueGenerateTab = (arg: { } } - if (model?.base === 'qwen-image' && model.format === 'gguf_quantized') { - if (!params.qwenImageComponentSource) { - reasons.push({ content: i18n.t('parameters.invoke.noQwenImageComponentSourceSelected') }); - } - } - if (model?.base === 'z-image') { // Check if VAE source is available (either separate VAE or Qwen3 Source) const hasVaeSource = params.zImageVaeModel !== null || 
params.zImageQwen3SourceModel !== null; diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts index 2a8a3d243b7..b598719989a 100644 --- a/invokeai/frontend/web/src/services/api/schema.ts +++ b/invokeai/frontend/web/src/services/api/schema.ts @@ -22749,8 +22749,8 @@ export type components = { type: "qwen_image_conditioning_output"; }; /** - * Denoise - Qwen Image Edit - * @description Run the denoising process with a Qwen Image Edit model. + * Denoise - Qwen Image + * @description Run the denoising process with a Qwen Image model. */ QwenImageDenoiseInvocation: { /** @@ -22867,8 +22867,8 @@ export type components = { type: "qwen_image_denoise"; }; /** - * Image to Latents - Qwen Image Edit - * @description Generates latents from an image using the Qwen Image Edit VAE. + * Image to Latents - Qwen Image + * @description Generates latents from an image using the Qwen Image VAE. */ QwenImageImageToLatentsInvocation: { /** @@ -22928,8 +22928,8 @@ export type components = { type: "qwen_image_i2l"; }; /** - * Latents to Image - Qwen Image Edit - * @description Generates an image from latents using the Qwen Image Edit VAE. + * Latents to Image - Qwen Image + * @description Generates an image from latents using the Qwen Image VAE. */ QwenImageLatentsToImageInvocation: { /** @@ -22977,8 +22977,8 @@ export type components = { type: "qwen_image_l2i"; }; /** - * Apply LoRA Collection - Qwen Image Edit - * @description Applies a collection of LoRAs to a Qwen Image Edit transformer. + * Apply LoRA Collection - Qwen Image + * @description Applies a collection of LoRAs to a Qwen Image transformer. */ QwenImageLoRACollectionLoader: { /** @@ -23018,8 +23018,8 @@ export type components = { type: "qwen_image_lora_collection_loader"; }; /** - * Apply LoRA - Qwen Image Edit - * @description Apply a LoRA model to a Qwen Image Edit transformer. 
+ * Apply LoRA - Qwen Image + * @description Apply a LoRA model to a Qwen Image transformer. */ QwenImageLoRALoaderInvocation: { /** @@ -23066,7 +23066,7 @@ export type components = { }; /** * QwenImageLoRALoaderOutput - * @description Qwen Image Edit LoRA Loader Output + * @description Qwen Image LoRA Loader Output */ QwenImageLoRALoaderOutput: { /** @@ -23083,8 +23083,8 @@ export type components = { type: "qwen_image_lora_loader_output"; }; /** - * Main Model - Qwen Image Edit - * @description Loads a Qwen Image Edit model, outputting its submodels. + * Main Model - Qwen Image + * @description Loads a Qwen Image model, outputting its submodels. * * The transformer is always loaded from the main model (Diffusers or GGUF). * @@ -23119,7 +23119,7 @@ export type components = { model: components["schemas"]["ModelIdentifierField"]; /** * Component Source (Diffusers) - * @description Diffusers Qwen Image Edit model to extract the VAE and Qwen VL encoder from. Required when using a GGUF quantized transformer. Ignored when the main model is already in Diffusers format. + * @description Diffusers Qwen Image model to extract the VAE and Qwen VL encoder from. Required when using a GGUF quantized transformer. Ignored when the main model is already in Diffusers format. * @default null */ component_source?: components["schemas"]["ModelIdentifierField"] | null; @@ -23132,7 +23132,7 @@ export type components = { }; /** * QwenImageModelLoaderOutput - * @description Qwen Image Edit base model loader output. + * @description Qwen Image model loader output. */ QwenImageModelLoaderOutput: { /** @@ -23158,8 +23158,8 @@ export type components = { type: "qwen_image_model_loader_output"; }; /** - * Prompt - Qwen Image Edit - * @description Encodes text and reference images for Qwen Image Edit using Qwen2.5-VL. + * Prompt - Qwen Image + * @description Encodes text and reference images for Qwen Image using Qwen2.5-VL. 
*/ QwenImageTextEncoderInvocation: { /** From b24e170465ad69a53a1c1ce6963ff481f50a3305 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Sun, 29 Mar 2026 23:03:36 -0400 Subject: [PATCH 16/18] fix: skip negative text encoder node when CFG <= 1 The negative conditioning node was always added to the graph, causing the text encoder to be loaded twice even when CFG=1 (where the negative prediction is unused). Now only adds the negative node when cfg_scale > 1. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../graph/generation/buildQwenImageGraph.ts | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts index 1ea20a377e6..336766e5cea 100644 --- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts @@ -59,13 +59,16 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise 1) + const useCfg = typeof cfg_scale === 'number' ? cfg_scale > 1 : true; + const negCond = useCfg + ? g.addNode({ + type: 'qwen_image_text_encoder', + id: getPrefixedId('neg_prompt'), + prompt: prompts.negative || ' ', + quantization: params.qwenImageQuantization, + }) + : null; const seed = g.addNode({ id: getPrefixedId('seed'), @@ -85,13 +88,15 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise Date: Mon, 30 Mar 2026 09:02:21 -0400 Subject: [PATCH 17/18] feat: support Kohya-format Qwen Image LoRAs (LoKR) Kohya LoRAs use underscore-separated keys like lora_unet_transformer_blocks_0_attn_to_k.lokr_w1 instead of the diffusers dot-separated format. 
Add: - Kohya key detection (lora_unet_transformer_blocks_*) - Key conversion mapping from Kohya underscores to model dot-paths - Updated LoRA config detection to recognize Kohya format + LoKR suffixes - Flux Kohya exclusion (lora_unet_double_blocks, lora_unet_single_blocks) - Test model for Kohya LoKR identification Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/model_manager/configs/lora.py | 21 ++- .../qwen_image_lora_conversion_utils.py | 122 ++++++++++++++++-- .../__test_metadata__.json | 3 + .../qwen_image_kohya_lokr_test.safetensors | 3 + 4 files changed, 134 insertions(+), 15 deletions(-) create mode 100644 tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/__test_metadata__.json create mode 100644 tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/qwen_image_kohya_lokr_test.safetensors diff --git a/invokeai/backend/model_manager/configs/lora.py b/invokeai/backend/model_manager/configs/lora.py index f2e6f3b34fa..05698a3c33a 100644 --- a/invokeai/backend/model_manager/configs/lora.py +++ b/invokeai/backend/model_manager/configs/lora.py @@ -769,15 +769,23 @@ def _validate_looks_like_lora(cls, mod: ModelOnDisk) -> None: has_qwen_ie_keys = state_dict_has_any_keys_starting_with( state_dict, - {"transformer_blocks.", "transformer.transformer_blocks."}, + { + "transformer_blocks.", + "transformer.transformer_blocks.", + "lora_unet_transformer_blocks_", # Kohya format + }, ) has_lora_suffix = state_dict_has_any_keys_ending_with( state_dict, - {"lora_A.weight", "lora_B.weight", "lora_down.weight", "lora_up.weight", "dora_scale"}, + { + "lora_A.weight", "lora_B.weight", "lora_down.weight", "lora_up.weight", + "dora_scale", "lokr_w1", "lokr_w2", # LoKR format + }, ) # Must NOT have diffusion_model.layers (Z-Image) or Flux-style keys. # Flux LoRAs can have transformer.single_transformer_blocks or transformer.transformer_blocks # (with the "transformer." 
prefix and "single_" variant) which would falsely match our check. + # Flux Kohya LoRAs use lora_unet_double_blocks or lora_unet_single_blocks. has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."}) has_flux_keys = state_dict_has_any_keys_starting_with( state_dict, @@ -786,6 +794,9 @@ def _validate_looks_like_lora(cls, mod: ModelOnDisk) -> None: "single_blocks.", "single_transformer_blocks.", "transformer.single_transformer_blocks.", + "lora_unet_double_blocks_", + "lora_unet_single_blocks_", + "lora_unet_single_transformer_blocks_", }, ) @@ -798,7 +809,8 @@ def _validate_looks_like_lora(cls, mod: ModelOnDisk) -> None: def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType: state_dict = mod.load_state_dict() has_qwen_ie_keys = state_dict_has_any_keys_starting_with( - state_dict, {"transformer_blocks.", "transformer.transformer_blocks."} + state_dict, + {"transformer_blocks.", "transformer.transformer_blocks.", "lora_unet_transformer_blocks_"}, ) has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."}) has_flux_keys = state_dict_has_any_keys_starting_with( @@ -808,6 +820,9 @@ def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType: "single_blocks.", "single_transformer_blocks.", "transformer.single_transformer_blocks.", + "lora_unet_double_blocks_", + "lora_unet_single_blocks_", + "lora_unet_single_transformer_blocks_", }, ) diff --git a/invokeai/backend/patches/lora_conversions/qwen_image_lora_conversion_utils.py b/invokeai/backend/patches/lora_conversions/qwen_image_lora_conversion_utils.py index 7488e0e72e3..df8aa2ef566 100644 --- a/invokeai/backend/patches/lora_conversions/qwen_image_lora_conversion_utils.py +++ b/invokeai/backend/patches/lora_conversions/qwen_image_lora_conversion_utils.py @@ -1,9 +1,13 @@ -"""Qwen Image Edit LoRA conversion utilities. +"""Qwen Image LoRA conversion utilities. -Qwen Image Edit uses QwenImageTransformer2DModel architecture. 
-LoRAs follow the standard format with lora_down.weight/lora_up.weight/alpha keys.
+Qwen Image uses QwenImageTransformer2DModel architecture.
+Supports multiple LoRA formats:
+- Diffusers/PEFT: transformer_blocks.0.attn.to_k.lora_down.weight
+- With prefix: transformer.transformer_blocks.0.attn.to_k.lora_down.weight
+- Kohya: lora_unet_transformer_blocks_0_attn_to_k.lora_down.weight (underscores instead of dots)
 """
 
+import re
 from typing import Dict
 
 import torch
@@ -15,23 +19,117 @@
 )
 from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
 
+# Regex for Kohya-format Qwen Image LoRA keys.
+# Example: lora_unet_transformer_blocks_0_attn_to_k
+# Groups: (block_idx, sub_module_with_underscores)
+_KOHYA_KEY_REGEX = re.compile(r"lora_unet_transformer_blocks_(\d+)_(.*)")
+
+# Mapping from Kohya underscore-separated sub-module names to dot-separated model paths.
+# The Kohya format uses underscores everywhere, but some underscores are part of the
+# module name (e.g., add_k_proj, to_out). Entries are matched by exact name.
+_KOHYA_MODULE_MAP: list[tuple[str, str]] = [ + # Attention projections + ("attn_add_k_proj", "attn.add_k_proj"), + ("attn_add_q_proj", "attn.add_q_proj"), + ("attn_add_v_proj", "attn.add_v_proj"), + ("attn_to_add_out", "attn.to_add_out"), + ("attn_to_out_0", "attn.to_out.0"), + ("attn_to_k", "attn.to_k"), + ("attn_to_q", "attn.to_q"), + ("attn_to_v", "attn.to_v"), + # Image stream MLP and modulation + ("img_mlp_net_0_proj", "img_mlp.net.0.proj"), + ("img_mlp_net_2", "img_mlp.net.2"), + ("img_mod_1", "img_mod.1"), + # Text stream MLP and modulation + ("txt_mlp_net_0_proj", "txt_mlp.net.0.proj"), + ("txt_mlp_net_2", "txt_mlp.net.2"), + ("txt_mod_1", "txt_mod.1"), +] + + +def is_state_dict_likely_kohya_qwen_image(state_dict: dict[str | int, torch.Tensor]) -> bool: + """Check if the state dict uses Kohya-format Qwen Image LoRA keys.""" + str_keys = [k for k in state_dict.keys() if isinstance(k, str)] + if not str_keys: + return False + # Check if any key matches the Kohya pattern + return any(k.startswith("lora_unet_transformer_blocks_") for k in str_keys) + + +def _convert_kohya_key(kohya_layer: str) -> str | None: + """Convert a Kohya-format layer name to a dot-separated model module path. + + Example: lora_unet_transformer_blocks_0_attn_to_k -> transformer_blocks.0.attn.to_k + """ + m = _KOHYA_KEY_REGEX.match(kohya_layer) + if not m: + return None + + block_idx = m.group(1) + sub_module = m.group(2) + + for kohya_name, model_path in _KOHYA_MODULE_MAP: + if sub_module == kohya_name: + return f"transformer_blocks.{block_idx}.{model_path}" + + # Fallback: unknown sub-module, return None so caller can warn/skip + return None + def lora_model_from_qwen_image_state_dict( state_dict: Dict[str, torch.Tensor], alpha: float | None = None ) -> ModelPatchRaw: - """Convert a Qwen Image Edit LoRA state dict to a ModelPatchRaw. 
- - The Lightning LoRA keys are in the format: - transformer_blocks.0.attn.to_k.lora_down.weight - transformer_blocks.0.attn.to_k.lora_up.weight - transformer_blocks.0.attn.to_k.alpha + """Convert a Qwen Image LoRA state dict to a ModelPatchRaw. - These are already the correct module paths for QwenImageTransformer2DModel. + Handles three key formats: + - Diffusers/PEFT: transformer_blocks.0.attn.to_k.lora_down.weight + - With prefix: transformer.transformer_blocks.0.attn.to_k.lora_down.weight + - Kohya: lora_unet_transformer_blocks_0_attn_to_k.lora_down.weight """ + is_kohya = is_state_dict_likely_kohya_qwen_image(state_dict) + + if is_kohya: + return _convert_kohya_format(state_dict, alpha) + else: + return _convert_diffusers_format(state_dict, alpha) + + +def _convert_kohya_format( + state_dict: Dict[str, torch.Tensor], alpha: float | None +) -> ModelPatchRaw: + """Convert Kohya-format state dict. Keys are like lora_unet_transformer_blocks_0_attn_to_k.lokr_w1""" + layers: dict[str, BaseLayerPatch] = {} + + # Group by layer (split at first dot: layer_name.param_name) + grouped: dict[str, dict[str, torch.Tensor]] = {} + for key, value in state_dict.items(): + if not isinstance(key, str): + continue + layer_name, param_name = key.split(".", 1) + if layer_name not in grouped: + grouped[layer_name] = {} + grouped[layer_name][param_name] = value + + for kohya_layer, layer_dict in grouped.items(): + model_path = _convert_kohya_key(kohya_layer) + if model_path is None: + continue # Skip unrecognized layers + + layer = any_lora_layer_from_state_dict(layer_dict) + final_key = f"{QWEN_IMAGE_EDIT_LORA_TRANSFORMER_PREFIX}{model_path}" + layers[final_key] = layer + + return ModelPatchRaw(layers=layers) + + +def _convert_diffusers_format( + state_dict: Dict[str, torch.Tensor], alpha: float | None +) -> ModelPatchRaw: + """Convert Diffusers/PEFT format state dict.""" layers: dict[str, BaseLayerPatch] = {} - # Some LoRAs use a "transformer." prefix on keys (e.g. 
"transformer.transformer_blocks.0.attn.to_k") - # while the model's module paths start at "transformer_blocks.0.attn.to_k". Strip it if present. + # Some LoRAs use a "transformer." prefix on keys strip_prefixes = ["transformer."] grouped = _group_by_layer(state_dict) diff --git a/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/__test_metadata__.json b/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/__test_metadata__.json new file mode 100644 index 00000000000..5a41ffed04c --- /dev/null +++ b/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/__test_metadata__.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32053abc6257adf4771405fddfdaed2b91497c7cd7b0ea6af0aa29f9e008ca2f +size 233 diff --git a/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/qwen_image_kohya_lokr_test.safetensors b/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/qwen_image_kohya_lokr_test.safetensors new file mode 100644 index 00000000000..6e34832a719 --- /dev/null +++ b/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/qwen_image_kohya_lokr_test.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b3d666baf329c922be86eacd12517cf734514da91377787d2f3cbd2b1a017c0 +size 2910 From 13a4b7634ba7ea2232c9c5c11f087582d293c5e8 Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Wed, 1 Apr 2026 13:38:58 -0400 Subject: [PATCH 18/18] fix: generate noise in float32 instead of float16 Float16 noise has limited precision that creates quantization patterns visible as vertical ripple artifacts, especially with few-step Lightning LoRA generation where the denoiser doesn't have enough steps to smooth them out. Use float32 (matching Z-Image and the diffusers pipeline). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- invokeai/app/invocations/qwen_image_denoise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/invokeai/app/invocations/qwen_image_denoise.py b/invokeai/app/invocations/qwen_image_denoise.py index 11c0a2e0e8e..4c62c0ebec6 100644 --- a/invokeai/app/invocations/qwen_image_denoise.py +++ b/invokeai/app/invocations/qwen_image_denoise.py @@ -132,7 +132,7 @@ def _get_noise( seed: int, ) -> torch.Tensor: rand_device = "cpu" - rand_dtype = torch.float16 + rand_dtype = torch.float32 return torch.randn( batch_size,