diff --git a/invokeai/app/invocations/qwen_image_denoise.py b/invokeai/app/invocations/qwen_image_denoise.py index cd3ff917596..4c62c0ebec6 100644 --- a/invokeai/app/invocations/qwen_image_denoise.py +++ b/invokeai/app/invocations/qwen_image_denoise.py @@ -36,14 +36,14 @@ @invocation( "qwen_image_denoise", - title="Denoise - Qwen Image Edit", + title="Denoise - Qwen Image", tags=["image", "qwen_image"], category="image", version="1.0.0", classification=Classification.Prototype, ) class QwenImageDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): - """Run the denoising process with a Qwen Image Edit model.""" + """Run the denoising process with a Qwen Image model.""" # If latents is provided, this means we are doing image-to-image. latents: Optional[LatentsField] = InputField( @@ -132,7 +132,7 @@ def _get_noise( seed: int, ) -> torch.Tensor: rand_device = "cpu" - rand_dtype = torch.float16 + rand_dtype = torch.float32 return torch.randn( batch_size, @@ -270,7 +270,7 @@ def _run_diffusion(self, context: InvocationContext): # Try to load the scheduler config from the model's directory (Diffusers models # have a scheduler/ subdir). For GGUF models this path doesn't exist, so fall - # back to instantiating the scheduler with the known Qwen Image Edit defaults. + # back to instantiating the scheduler with the known Qwen Image defaults. model_path = context.models.get_absolute_path(context.models.get_config(self.transformer.transformer)) scheduler_path = model_path / "scheduler" if scheduler_path.is_dir() and (scheduler_path / "scheduler_config.json").exists(): @@ -304,8 +304,19 @@ def _run_diffusion(self, context: InvocationContext): init_sigmas = np.linspace(1.0, 1.0 / self.steps, self.steps).tolist() scheduler.set_timesteps(sigmas=init_sigmas, mu=mu, device=device) - timesteps_sched = scheduler.timesteps - sigmas_sched = scheduler.sigmas + # Clip the schedule based on denoising_start/denoising_end to support img2img strength. 
+ # The scheduler's sigmas go from high (noisy) to 0 (clean). We clip to the fractional range. + sigmas_sched = scheduler.sigmas # (N+1,) including terminal 0 + if self.denoising_start > 0 or self.denoising_end < 1: + total_sigmas = len(sigmas_sched) - 1 # exclude terminal + start_idx = int(round(self.denoising_start * total_sigmas)) + end_idx = int(round(self.denoising_end * total_sigmas)) + sigmas_sched = sigmas_sched[start_idx : end_idx + 1] # +1 to include the next sigma for dt + # Rebuild timesteps from clipped sigmas (exclude terminal 0) + timesteps_sched = sigmas_sched[:-1] * scheduler.config.num_train_timesteps + else: + timesteps_sched = scheduler.timesteps + total_steps = len(timesteps_sched) cfg_scale = self._prepare_cfg_scale(total_steps) @@ -353,29 +364,44 @@ def _run_diffusion(self, context: InvocationContext): # Pack latents into 2x2 patches: (B, C, H, W) -> (B, H/2*W/2, C*4) latents = self._pack_latents(latents, 1, out_channels, latent_height, latent_width) - # Pack reference image latents and concatenate along the sequence dimension. - # The edit transformer always expects [noisy_patches ; ref_patches] in its sequence. - if ref_latents is not None: - _, ref_ch, rh, rw = ref_latents.shape - if rh != latent_height or rw != latent_width: - ref_latents = torch.nn.functional.interpolate( - ref_latents, size=(latent_height, latent_width), mode="bilinear" + # Determine whether the model uses reference latent conditioning (zero_cond_t). + # Edit models (zero_cond_t=True) expect [noisy_patches ; ref_patches] in the sequence. + # Txt2img models (zero_cond_t=False) only take noisy patches. 
+ has_zero_cond_t = getattr(transformer_info.model, "zero_cond_t", False) or getattr( + transformer_info.model.config, "zero_cond_t", False + ) + use_ref_latents = has_zero_cond_t + + ref_latents_packed = None + if use_ref_latents: + if ref_latents is not None: + _, ref_ch, rh, rw = ref_latents.shape + if rh != latent_height or rw != latent_width: + ref_latents = torch.nn.functional.interpolate( + ref_latents, size=(latent_height, latent_width), mode="bilinear" + ) + else: + # No reference image provided — use zeros so the model still gets the + # expected sequence layout. + ref_latents = torch.zeros( + 1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype ) + ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width) + + # img_shapes tells the transformer the spatial layout of patches. + if use_ref_latents: + img_shapes = [ + [ + (1, latent_height // 2, latent_width // 2), + (1, latent_height // 2, latent_width // 2), + ] + ] else: - # No reference image provided — use zeros so the model still gets the - # expected sequence layout. - ref_latents = torch.zeros( - 1, out_channels, latent_height, latent_width, device=device, dtype=inference_dtype - ) - ref_latents_packed = self._pack_latents(ref_latents, 1, out_channels, latent_height, latent_width) - - # img_shapes tells the transformer the spatial layout of noisy and reference patches. 
- img_shapes = [ - [ - (1, latent_height // 2, latent_width // 2), - (1, latent_height // 2, latent_width // 2), + img_shapes = [ + [ + (1, latent_height // 2, latent_width // 2), + ] ] - ] # Prepare inpaint extension (operates in 4D space, so unpack/repack around it) inpaint_mask = self._prep_inpaint_mask(context, noise) # noise has the right 4D shape @@ -422,14 +448,16 @@ def _run_diffusion(self, context: InvocationContext): ) ) - scheduler.set_begin_index(0) - for step_idx, t in enumerate(tqdm(timesteps_sched)): # The pipeline passes timestep / 1000 to the transformer timestep = t.expand(latents.shape[0]).to(inference_dtype) - # Concatenate noisy and reference patches along the sequence dim - model_input = torch.cat([latents, ref_latents_packed], dim=1) + # For edit models: concatenate noisy and reference patches along the sequence dim + # For txt2img models: just use noisy patches + if ref_latents_packed is not None: + model_input = torch.cat([latents, ref_latents_packed], dim=1) + else: + model_input = latents noise_pred_cond = transformer( hidden_states=model_input, @@ -457,8 +485,12 @@ def _run_diffusion(self, context: InvocationContext): else: noise_pred = noise_pred_cond - # Use the scheduler's step method — exactly matching the pipeline - latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0] + # Euler step using the (possibly clipped) sigma schedule + sigma_curr = sigmas_sched[step_idx] + sigma_next = sigmas_sched[step_idx + 1] + dt = sigma_next - sigma_curr + latents = latents.to(torch.float32) + dt * noise_pred.to(torch.float32) + latents = latents.to(inference_dtype) if inpaint_extension is not None: sigma_next = sigmas_sched[step_idx + 1].item() diff --git a/invokeai/app/invocations/qwen_image_image_to_latents.py b/invokeai/app/invocations/qwen_image_image_to_latents.py index 19d233a7073..c5fe1b5d5c8 100644 --- a/invokeai/app/invocations/qwen_image_image_to_latents.py +++ b/invokeai/app/invocations/qwen_image_image_to_latents.py @@ 
-22,14 +22,14 @@ @invocation( "qwen_image_i2l", - title="Image to Latents - Qwen Image Edit", + title="Image to Latents - Qwen Image", tags=["image", "latents", "vae", "i2l", "qwen_image"], category="image", version="1.0.0", classification=Classification.Prototype, ) class QwenImageImageToLatentsInvocation(BaseInvocation, WithMetadata, WithBoard): - """Generates latents from an image using the Qwen Image Edit VAE.""" + """Generates latents from an image using the Qwen Image VAE.""" image: ImageField = InputField(description="The image to encode.") vae: VAEField = InputField(description=FieldDescriptions.vae, input=Input.Connection) @@ -51,7 +51,7 @@ def vae_encode(vae_info: LoadedModel, image_tensor: torch.Tensor) -> torch.Tenso image_tensor = image_tensor.to(device=TorchDevice.choose_torch_device(), dtype=vae.dtype) with torch.inference_mode(): - # The Qwen Image Edit VAE expects 5D input: (B, C, num_frames, H, W) + # The Qwen Image VAE expects 5D input: (B, C, num_frames, H, W) if image_tensor.dim() == 4: image_tensor = image_tensor.unsqueeze(2) diff --git a/invokeai/app/invocations/qwen_image_latents_to_image.py b/invokeai/app/invocations/qwen_image_latents_to_image.py index f1bce204806..b3ea39c4bbf 100644 --- a/invokeai/app/invocations/qwen_image_latents_to_image.py +++ b/invokeai/app/invocations/qwen_image_latents_to_image.py @@ -23,14 +23,14 @@ @invocation( "qwen_image_l2i", - title="Latents to Image - Qwen Image Edit", + title="Latents to Image - Qwen Image", tags=["latents", "image", "vae", "l2i", "qwen_image"], category="latents", version="1.0.0", classification=Classification.Prototype, ) class QwenImageLatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard): - """Generates an image from latents using the Qwen Image Edit VAE.""" + """Generates an image from latents using the Qwen Image VAE.""" latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection) vae: VAEField = 
InputField(description=FieldDescriptions.vae, input=Input.Connection) @@ -56,7 +56,7 @@ def invoke(self, context: InvocationContext) -> ImageOutput: TorchDevice.empty_cache() with torch.inference_mode(), tiling_context: - # The Qwen Image Edit VAE uses per-channel latents_mean / latents_std + # The Qwen Image VAE uses per-channel latents_mean / latents_std # instead of a single scaling_factor. # Latents are 5D: (B, C, num_frames, H, W) — the unpack from the # denoise step already produces this shape. diff --git a/invokeai/app/invocations/qwen_image_lora_loader.py b/invokeai/app/invocations/qwen_image_lora_loader.py index fb056166153..f670b2d8954 100644 --- a/invokeai/app/invocations/qwen_image_lora_loader.py +++ b/invokeai/app/invocations/qwen_image_lora_loader.py @@ -15,7 +15,7 @@ @invocation_output("qwen_image_lora_loader_output") class QwenImageLoRALoaderOutput(BaseInvocationOutput): - """Qwen Image Edit LoRA Loader Output""" + """Qwen Image LoRA Loader Output""" transformer: Optional[TransformerField] = OutputField( default=None, description=FieldDescriptions.transformer, title="Transformer" @@ -24,14 +24,14 @@ class QwenImageLoRALoaderOutput(BaseInvocationOutput): @invocation( "qwen_image_lora_loader", - title="Apply LoRA - Qwen Image Edit", + title="Apply LoRA - Qwen Image", tags=["lora", "model", "qwen_image"], category="model", version="1.0.0", classification=Classification.Prototype, ) class QwenImageLoRALoaderInvocation(BaseInvocation): - """Apply a LoRA model to a Qwen Image Edit transformer.""" + """Apply a LoRA model to a Qwen Image transformer.""" lora: ModelIdentifierField = InputField( description=FieldDescriptions.lora_model, @@ -72,14 +72,14 @@ def invoke(self, context: InvocationContext) -> QwenImageLoRALoaderOutput: @invocation( "qwen_image_lora_collection_loader", - title="Apply LoRA Collection - Qwen Image Edit", + title="Apply LoRA Collection - Qwen Image", tags=["lora", "model", "qwen_image"], category="model", version="1.0.0", 
classification=Classification.Prototype, ) class QwenImageLoRACollectionLoader(BaseInvocation): - """Applies a collection of LoRAs to a Qwen Image Edit transformer.""" + """Applies a collection of LoRAs to a Qwen Image transformer.""" loras: Optional[LoRAField | list[LoRAField]] = InputField( default=None, description="LoRA models and weights. May be a single LoRA or collection.", title="LoRAs" diff --git a/invokeai/app/invocations/qwen_image_model_loader.py b/invokeai/app/invocations/qwen_image_model_loader.py index e2d21820b05..fd96067f561 100644 --- a/invokeai/app/invocations/qwen_image_model_loader.py +++ b/invokeai/app/invocations/qwen_image_model_loader.py @@ -20,7 +20,7 @@ @invocation_output("qwen_image_model_loader_output") class QwenImageModelLoaderOutput(BaseInvocationOutput): - """Qwen Image Edit base model loader output.""" + """Qwen Image model loader output.""" transformer: TransformerField = OutputField(description=FieldDescriptions.transformer, title="Transformer") qwen_vl_encoder: QwenVLEncoderField = OutputField( @@ -31,14 +31,14 @@ class QwenImageModelLoaderOutput(BaseInvocationOutput): @invocation( "qwen_image_model_loader", - title="Main Model - Qwen Image Edit", + title="Main Model - Qwen Image", tags=["model", "qwen_image"], category="model", version="1.1.0", classification=Classification.Prototype, ) class QwenImageModelLoaderInvocation(BaseInvocation): - """Loads a Qwen Image Edit model, outputting its submodels. + """Loads a Qwen Image model, outputting its submodels. The transformer is always loaded from the main model (Diffusers or GGUF). @@ -59,7 +59,7 @@ class QwenImageModelLoaderInvocation(BaseInvocation): component_source: Optional[ModelIdentifierField] = InputField( default=None, - description="Diffusers Qwen Image Edit model to extract the VAE and Qwen VL encoder from. " + description="Diffusers Qwen Image model to extract the VAE and Qwen VL encoder from. " "Required when using a GGUF quantized transformer. 
" "Ignored when the main model is already in Diffusers format.", input=Input.Direct, @@ -96,7 +96,7 @@ def invoke(self, context: InvocationContext) -> QwenImageModelLoaderOutput: raise ValueError( "No source for VAE and Qwen VL encoder. " "GGUF quantized models only contain the transformer — " - "please set 'Component Source' to a Diffusers Qwen Image Edit model " + "please set 'Component Source' to a Diffusers Qwen Image model " "to provide the VAE and text encoder." ) diff --git a/invokeai/app/invocations/qwen_image_text_encoder.py b/invokeai/app/invocations/qwen_image_text_encoder.py index 641e8c4d388..a067421452e 100644 --- a/invokeai/app/invocations/qwen_image_text_encoder.py +++ b/invokeai/app/invocations/qwen_image_text_encoder.py @@ -20,39 +20,57 @@ QwenImageConditioningInfo, ) -# The Qwen Image Edit pipeline uses a specific system prompt and drops the first -# N tokens (the system prompt prefix) from the embeddings. These constants are -# taken directly from the diffusers QwenImagePipeline. -_SYSTEM_PROMPT = ( +# Prompt templates and drop indices for the two Qwen Image model modes. +# These are taken directly from the diffusers pipelines. + +# Image editing mode (QwenImageEditPipeline) +_EDIT_SYSTEM_PROMPT = ( "Describe the key features of the input image (color, shape, size, texture, objects, background), " "then explain how the user's text instruction should alter or modify the image. " "Generate a new image that meets the user's requirements while maintaining consistency " "with the original input where appropriate." 
) +_EDIT_DROP_IDX = 64 + +# Text-to-image mode (QwenImagePipeline) +_GENERATE_SYSTEM_PROMPT = ( + "Describe the image by detailing the color, shape, size, texture, quantity, " + "text, spatial relationships of the objects and background:" +) +_GENERATE_DROP_IDX = 34 + _IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>" -_DROP_IDX = 64 def _build_prompt(user_prompt: str, num_images: int) -> str: - """Build the full prompt with one vision placeholder per reference image.""" - image_tokens = _IMAGE_PLACEHOLDER * max(num_images, 1) - return ( - f"<|im_start|>system\n{_SYSTEM_PROMPT}<|im_end|>\n" - f"<|im_start|>user\n{image_tokens}{user_prompt}<|im_end|>\n" - "<|im_start|>assistant\n" - ) + """Build the full prompt with the appropriate template based on whether reference images are provided.""" + if num_images > 0: + # Edit mode: include vision placeholders for reference images + image_tokens = _IMAGE_PLACEHOLDER * num_images + return ( + f"<|im_start|>system\n{_EDIT_SYSTEM_PROMPT}<|im_end|>\n" + f"<|im_start|>user\n{image_tokens}{user_prompt}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + else: + # Generate mode: text-only prompt + return ( + f"<|im_start|>system\n{_GENERATE_SYSTEM_PROMPT}<|im_end|>\n" + f"<|im_start|>user\n{user_prompt}<|im_end|>\n" + "<|im_start|>assistant\n" + ) @invocation( "qwen_image_text_encoder", - title="Prompt - Qwen Image Edit", + title="Prompt - Qwen Image", tags=["prompt", "conditioning", "qwen_image"], category="conditioning", version="1.2.0", classification=Classification.Prototype, ) class QwenImageTextEncoderInvocation(BaseInvocation): - """Encodes text and reference images for Qwen Image Edit using Qwen2.5-VL.""" + """Encodes text and reference images for Qwen Image using Qwen2.5-VL.""" prompt: str = InputField(description="Text prompt describing the desired edit.", ui_component=UIComponent.Textarea) reference_images: list[ImageField] = InputField( @@ -188,7 +206,10 @@ def _encode( hidden_states = 
outputs.hidden_states[-1] # Extract valid (non-padding) tokens using the attention mask, - # then drop the first _DROP_IDX tokens (system prompt prefix). + # then drop the system prompt prefix tokens. + # The drop index differs between edit mode (64) and generate mode (34). + drop_idx = _EDIT_DROP_IDX if images else _GENERATE_DROP_IDX + attn_mask = model_inputs.attention_mask bool_mask = attn_mask.bool() valid_lengths = bool_mask.sum(dim=1) @@ -196,7 +217,7 @@ def _encode( split_hidden = torch.split(selected, valid_lengths.tolist(), dim=0) # Drop system prefix tokens and build padded output - trimmed = [h[_DROP_IDX:] for h in split_hidden] + trimmed = [h[drop_idx:] for h in split_hidden] attn_mask_list = [torch.ones(h.size(0), dtype=torch.long, device=device) for h in trimmed] max_seq_len = max(h.size(0) for h in trimmed) diff --git a/invokeai/app/services/model_records/model_records_base.py b/invokeai/app/services/model_records/model_records_base.py index ea5b9ef7546..dcdc0ce5956 100644 --- a/invokeai/app/services/model_records/model_records_base.py +++ b/invokeai/app/services/model_records/model_records_base.py @@ -25,8 +25,8 @@ ModelSourceType, ModelType, ModelVariantType, - QwenImageVariantType, Qwen3VariantType, + QwenImageVariantType, SchedulerPredictionType, ZImageVariantType, ) @@ -95,7 +95,13 @@ class ModelRecordChanges(BaseModelExcludeNull): # Checkpoint-specific changes # TODO(MM2): Should we expose these? Feels footgun-y... 
variant: Optional[ - ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | ZImageVariantType | QwenImageVariantType | Qwen3VariantType + ModelVariantType + | ClipVariantType + | FluxVariantType + | Flux2VariantType + | ZImageVariantType + | QwenImageVariantType + | Qwen3VariantType ] = Field(description="The variant of the model.", default=None) prediction_type: Optional[SchedulerPredictionType] = Field( description="The prediction type of the model.", default=None diff --git a/invokeai/backend/model_manager/configs/lora.py b/invokeai/backend/model_manager/configs/lora.py index a5b9f40631d..05698a3c33a 100644 --- a/invokeai/backend/model_manager/configs/lora.py +++ b/invokeai/backend/model_manager/configs/lora.py @@ -769,29 +769,62 @@ def _validate_looks_like_lora(cls, mod: ModelOnDisk) -> None: has_qwen_ie_keys = state_dict_has_any_keys_starting_with( state_dict, - {"transformer_blocks.", "transformer.transformer_blocks."}, + { + "transformer_blocks.", + "transformer.transformer_blocks.", + "lora_unet_transformer_blocks_", # Kohya format + }, ) has_lora_suffix = state_dict_has_any_keys_ending_with( state_dict, - {"lora_A.weight", "lora_B.weight", "lora_down.weight", "lora_up.weight", "dora_scale"}, + { + "lora_A.weight", "lora_B.weight", "lora_down.weight", "lora_up.weight", + "dora_scale", "lokr_w1", "lokr_w2", # LoKR format + }, ) - # Must NOT have diffusion_model.layers (Z-Image) or double_blocks/single_blocks (Flux) + # Must NOT have diffusion_model.layers (Z-Image) or Flux-style keys. + # Flux LoRAs can have transformer.single_transformer_blocks or transformer.transformer_blocks + # (with the "transformer." prefix and "single_" variant) which would falsely match our check. + # Flux Kohya LoRAs use lora_unet_double_blocks or lora_unet_single_blocks. 
has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."}) - has_flux_keys = state_dict_has_any_keys_starting_with(state_dict, {"double_blocks.", "single_blocks."}) + has_flux_keys = state_dict_has_any_keys_starting_with( + state_dict, + { + "double_blocks.", + "single_blocks.", + "single_transformer_blocks.", + "transformer.single_transformer_blocks.", + "lora_unet_double_blocks_", + "lora_unet_single_blocks_", + "lora_unet_single_transformer_blocks_", + }, + ) if has_qwen_ie_keys and has_lora_suffix and not has_z_image_keys and not has_flux_keys: return - raise NotAMatchError("model does not match Qwen Image Edit LoRA heuristics") + raise NotAMatchError("model does not match Qwen Image LoRA heuristics") @classmethod def _get_base_or_raise(cls, mod: ModelOnDisk) -> BaseModelType: state_dict = mod.load_state_dict() has_qwen_ie_keys = state_dict_has_any_keys_starting_with( - state_dict, {"transformer_blocks.", "transformer.transformer_blocks."} + state_dict, + {"transformer_blocks.", "transformer.transformer_blocks.", "lora_unet_transformer_blocks_"}, ) has_z_image_keys = state_dict_has_any_keys_starting_with(state_dict, {"diffusion_model.layers."}) - has_flux_keys = state_dict_has_any_keys_starting_with(state_dict, {"double_blocks.", "single_blocks."}) + has_flux_keys = state_dict_has_any_keys_starting_with( + state_dict, + { + "double_blocks.", + "single_blocks.", + "single_transformer_blocks.", + "transformer.single_transformer_blocks.", + "lora_unet_double_blocks_", + "lora_unet_single_blocks_", + "lora_unet_single_transformer_blocks_", + }, + ) if has_qwen_ie_keys and not has_z_image_keys and not has_flux_keys: return BaseModelType.QwenImage diff --git a/invokeai/backend/model_manager/configs/main.py b/invokeai/backend/model_manager/configs/main.py index 484a95f4bb8..6ec0611fdf3 100644 --- a/invokeai/backend/model_manager/configs/main.py +++ b/invokeai/backend/model_manager/configs/main.py @@ -1208,7 +1208,7 @@ class 
Main_Diffusers_QwenImage_Config(Diffusers_Config_Base, Main_Config_Base, C """Model config for Qwen Image diffusers models (both txt2img and edit).""" base: Literal[BaseModelType.QwenImage] = Field(BaseModelType.QwenImage) - variant: QwenImageVariantType = Field(default=QwenImageVariantType.Generate) + variant: QwenImageVariantType | None = Field(default=None) @classmethod def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self: @@ -1269,7 +1269,7 @@ class Main_GGUF_QwenImage_Config(Checkpoint_Config_Base, Main_Config_Base, Confi base: Literal[BaseModelType.QwenImage] = Field(default=BaseModelType.QwenImage) format: Literal[ModelFormat.GGUFQuantized] = Field(default=ModelFormat.GGUFQuantized) - variant: QwenImageVariantType = Field(default=QwenImageVariantType.Generate) + variant: QwenImageVariantType | None = Field(default=None) @classmethod def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self: diff --git a/invokeai/backend/model_manager/load/model_loaders/qwen_image.py b/invokeai/backend/model_manager/load/model_loaders/qwen_image.py index 15fcedba166..a025e727945 100644 --- a/invokeai/backend/model_manager/load/model_loaders/qwen_image.py +++ b/invokeai/backend/model_manager/load/model_loaders/qwen_image.py @@ -15,6 +15,7 @@ BaseModelType, ModelFormat, ModelType, + QwenImageVariantType, SubModelType, ) from invokeai.backend.quantization.gguf.ggml_tensor import GGMLTensor @@ -160,10 +161,13 @@ def _load_from_singlefile(self, config: AnyModelConfig) -> AnyModel: "axes_dims_rope": (16, 56, 56), } - # zero_cond_t was added in diffusers 0.37+; skip it on older versions + # zero_cond_t is only used by edit-variant models. It enables dual modulation + # for noisy vs reference patches. Setting it on txt2img models produces garbage. + # Also requires diffusers 0.37+ (the parameter doesn't exist in older versions). 
import inspect - if "zero_cond_t" in inspect.signature(QwenImageTransformer2DModel.__init__).parameters: + is_edit = getattr(config, "variant", None) == QwenImageVariantType.Edit + if is_edit and "zero_cond_t" in inspect.signature(QwenImageTransformer2DModel.__init__).parameters: model_config["zero_cond_t"] = True with accelerate.init_empty_weights(): diff --git a/invokeai/backend/model_manager/starter_models.py b/invokeai/backend/model_manager/starter_models.py index de5f1e1b8b6..ef7b25431a0 100644 --- a/invokeai/backend/model_manager/starter_models.py +++ b/invokeai/backend/model_manager/starter_models.py @@ -650,7 +650,7 @@ class StarterModelBundle(BaseModel): # endregion # region Qwen Image Edit -qwen_image = StarterModel( +qwen_image_edit = StarterModel( name="Qwen Image Edit 2511", base=BaseModelType.QwenImage, source="Qwen/Qwen-Image-Edit-2511", @@ -658,43 +658,43 @@ class StarterModelBundle(BaseModel): type=ModelType.Main, ) -qwen_image_gguf_q4_k_m = StarterModel( +qwen_image_edit_gguf_q4_k_m = StarterModel( name="Qwen Image Edit 2511 (Q4_K_M)", base=BaseModelType.QwenImage, - source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q4_K_M.gguf", + source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q4_K_M.gguf", description="Qwen Image Edit 2511 - Q4_K_M quantized transformer. Good quality/size balance. (~13GB)", type=ModelType.Main, format=ModelFormat.GGUFQuantized, ) -qwen_image_gguf_q2_k = StarterModel( +qwen_image_edit_gguf_q2_k = StarterModel( name="Qwen Image Edit 2511 (Q2_K)", base=BaseModelType.QwenImage, - source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q2_K.gguf", + source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q2_K.gguf", description="Qwen Image Edit 2511 - Q2_K heavily quantized transformer. Smallest size, lower quality. 
(~7.5GB)", type=ModelType.Main, format=ModelFormat.GGUFQuantized, ) -qwen_image_gguf_q6_k = StarterModel( +qwen_image_edit_gguf_q6_k = StarterModel( name="Qwen Image Edit 2511 (Q6_K)", base=BaseModelType.QwenImage, - source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q6_K.gguf", + source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q6_K.gguf", description="Qwen Image Edit 2511 - Q6_K quantized transformer. Near-lossless quality. (~17GB)", type=ModelType.Main, format=ModelFormat.GGUFQuantized, ) -qwen_image_gguf_q8_0 = StarterModel( +qwen_image_edit_gguf_q8_0 = StarterModel( name="Qwen Image Edit 2511 (Q8_0)", base=BaseModelType.QwenImage, - source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-2511-Q8_0.gguf", + source="https://huggingface.co/unsloth/Qwen-Image-Edit-2511-GGUF/resolve/main/qwen-image-edit-2511-Q8_0.gguf", description="Qwen Image Edit 2511 - Q8_0 quantized transformer. Highest quality quantization. 
(~22GB)", type=ModelType.Main, format=ModelFormat.GGUFQuantized, ) -qwen_image_lightning_4step = StarterModel( +qwen_image_edit_lightning_4step = StarterModel( name="Qwen Image Edit Lightning (4-step, bf16)", base=BaseModelType.QwenImage, source="https://huggingface.co/lightx2v/Qwen-Image-Edit-2511-Lightning/resolve/main/Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors", @@ -703,7 +703,7 @@ class StarterModelBundle(BaseModel): type=ModelType.LoRA, ) -qwen_image_lightning_8step = StarterModel( +qwen_image_edit_lightning_8step = StarterModel( name="Qwen Image Edit Lightning (8-step, bf16)", base=BaseModelType.QwenImage, source="https://huggingface.co/lightx2v/Qwen-Image-Edit-2511-Lightning/resolve/main/Qwen-Image-Edit-2511-Lightning-8steps-V1.0-bf16.safetensors", @@ -711,6 +711,69 @@ class StarterModelBundle(BaseModel): "Settings: Steps=8, CFG=1, Shift Override=3.", type=ModelType.LoRA, ) + +# Qwen Image (txt2img) +qwen_image = StarterModel( + name="Qwen Image 2512", + base=BaseModelType.QwenImage, + source="Qwen/Qwen-Image-2512", + description="Qwen Image 2512 full diffusers model. High-quality text-to-image generation. (~40GB)", + type=ModelType.Main, +) + +qwen_image_gguf_q4_k_m = StarterModel( + name="Qwen Image 2512 (Q4_K_M)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q4_K_M.gguf", + description="Qwen Image 2512 - Q4_K_M quantized transformer. Good quality/size balance. (~13GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_gguf_q2_k = StarterModel( + name="Qwen Image 2512 (Q2_K)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q2_K.gguf", + description="Qwen Image 2512 - Q2_K heavily quantized transformer. Smallest size, lower quality. 
(~7.5GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_gguf_q6_k = StarterModel( + name="Qwen Image 2512 (Q6_K)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q6_K.gguf", + description="Qwen Image 2512 - Q6_K quantized transformer. Near-lossless quality. (~17GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_gguf_q8_0 = StarterModel( + name="Qwen Image 2512 (Q8_0)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/unsloth/Qwen-Image-2512-GGUF/resolve/main/qwen-image-2512-Q8_0.gguf", + description="Qwen Image 2512 - Q8_0 quantized transformer. Highest quality quantization. (~22GB)", + type=ModelType.Main, + format=ModelFormat.GGUFQuantized, +) + +qwen_image_lightning_4step = StarterModel( + name="Qwen Image Lightning (4-step, V2.0, bf16)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-4steps-V2.0-bf16.safetensors", + description="Lightning distillation LoRA for Qwen Image — enables generation in just 4 steps. " + "Settings: Steps=4, CFG=1, Shift Override=3.", + type=ModelType.LoRA, +) + +qwen_image_lightning_8step = StarterModel( + name="Qwen Image Lightning (8-step, V2.0, bf16)", + base=BaseModelType.QwenImage, + source="https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-8steps-V2.0-bf16.safetensors", + description="Lightning distillation LoRA for Qwen Image — enables generation in 8 steps with better quality. 
" + "Settings: Steps=8, CFG=1, Shift Override=3.", + type=ModelType.LoRA, +) # endregion # region SigLIP @@ -1012,6 +1075,13 @@ class StarterModelBundle(BaseModel): flux2_klein_qwen3_4b_encoder, flux2_klein_qwen3_8b_encoder, cogview4, + qwen_image_edit, + qwen_image_edit_gguf_q2_k, + qwen_image_edit_gguf_q4_k_m, + qwen_image_edit_gguf_q6_k, + qwen_image_edit_gguf_q8_0, + qwen_image_edit_lightning_4step, + qwen_image_edit_lightning_8step, qwen_image, qwen_image_gguf_q2_k, qwen_image_gguf_q4_k_m, @@ -1097,9 +1167,13 @@ class StarterModelBundle(BaseModel): ] qwen_image_bundle: list[StarterModel] = [ + qwen_image_edit, + qwen_image_edit_gguf_q4_k_m, + qwen_image_edit_gguf_q8_0, + qwen_image_edit_lightning_4step, + qwen_image_edit_lightning_8step, qwen_image, qwen_image_gguf_q4_k_m, - qwen_image_gguf_q8_0, qwen_image_lightning_4step, qwen_image_lightning_8step, ] diff --git a/invokeai/backend/model_manager/taxonomy.py b/invokeai/backend/model_manager/taxonomy.py index 9250310a29a..587c0b0625f 100644 --- a/invokeai/backend/model_manager/taxonomy.py +++ b/invokeai/backend/model_manager/taxonomy.py @@ -225,8 +225,28 @@ class FluxLoRAFormat(str, Enum): AnyVariant: TypeAlias = Union[ - ModelVariantType, ClipVariantType, FluxVariantType, Flux2VariantType, ZImageVariantType, QwenImageVariantType, Qwen3VariantType + ModelVariantType, + ClipVariantType, + FluxVariantType, + Flux2VariantType, + ZImageVariantType, + QwenImageVariantType, + Qwen3VariantType, ] variant_type_adapter = TypeAdapter[ - ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | ZImageVariantType | QwenImageVariantType | Qwen3VariantType -](ModelVariantType | ClipVariantType | FluxVariantType | Flux2VariantType | ZImageVariantType | QwenImageVariantType | Qwen3VariantType) + ModelVariantType + | ClipVariantType + | FluxVariantType + | Flux2VariantType + | ZImageVariantType + | QwenImageVariantType + | Qwen3VariantType +]( + ModelVariantType + | ClipVariantType + | FluxVariantType + | 
Flux2VariantType + | ZImageVariantType + | QwenImageVariantType + | Qwen3VariantType +) diff --git a/invokeai/backend/patches/lora_conversions/qwen_image_lora_conversion_utils.py b/invokeai/backend/patches/lora_conversions/qwen_image_lora_conversion_utils.py index 7488e0e72e3..df8aa2ef566 100644 --- a/invokeai/backend/patches/lora_conversions/qwen_image_lora_conversion_utils.py +++ b/invokeai/backend/patches/lora_conversions/qwen_image_lora_conversion_utils.py @@ -1,9 +1,13 @@ -"""Qwen Image Edit LoRA conversion utilities. +"""Qwen Image LoRA conversion utilities. -Qwen Image Edit uses QwenImageTransformer2DModel architecture. -LoRAs follow the standard format with lora_down.weight/lora_up.weight/alpha keys. +Qwen Image uses QwenImageTransformer2DModel architecture. +Supports multiple LoRA formats: +- Diffusers/PEFT: transformer_blocks.0.attn.to_k.lora_down.weight +- With prefix: transformer.transformer_blocks.0.attn.to_k.lora_down.weight +- Kohya: lora_unet_transformer_blocks_0_attn_to_k.lora_down.weight (underscores instead of dots) """ +import re from typing import Dict import torch @@ -15,23 +19,117 @@ ) from invokeai.backend.patches.model_patch_raw import ModelPatchRaw +# Regex for Kohya-format Qwen Image LoRA keys. +# Example: lora_unet_transformer_blocks_0_attn_to_k +# Groups: (block_idx, sub_module_with_underscores) +_KOHYA_KEY_REGEX = re.compile(r"lora_unet_transformer_blocks_(\d+)_(.*)") + +# Mapping from Kohya underscore-separated sub-module names to dot-separated model paths. +# The Kohya format uses underscores everywhere, but some underscores are part of the +# module name (e.g., add_k_proj, to_out). Each entry is matched against the whole sub-module name (exact match).
+_KOHYA_MODULE_MAP: list[tuple[str, str]] = [ + # Attention projections + ("attn_add_k_proj", "attn.add_k_proj"), + ("attn_add_q_proj", "attn.add_q_proj"), + ("attn_add_v_proj", "attn.add_v_proj"), + ("attn_to_add_out", "attn.to_add_out"), + ("attn_to_out_0", "attn.to_out.0"), + ("attn_to_k", "attn.to_k"), + ("attn_to_q", "attn.to_q"), + ("attn_to_v", "attn.to_v"), + # Image stream MLP and modulation + ("img_mlp_net_0_proj", "img_mlp.net.0.proj"), + ("img_mlp_net_2", "img_mlp.net.2"), + ("img_mod_1", "img_mod.1"), + # Text stream MLP and modulation + ("txt_mlp_net_0_proj", "txt_mlp.net.0.proj"), + ("txt_mlp_net_2", "txt_mlp.net.2"), + ("txt_mod_1", "txt_mod.1"), +] + + +def is_state_dict_likely_kohya_qwen_image(state_dict: dict[str | int, torch.Tensor]) -> bool: + """Check if the state dict uses Kohya-format Qwen Image LoRA keys.""" + str_keys = [k for k in state_dict.keys() if isinstance(k, str)] + if not str_keys: + return False + # Check if any key matches the Kohya pattern + return any(k.startswith("lora_unet_transformer_blocks_") for k in str_keys) + + +def _convert_kohya_key(kohya_layer: str) -> str | None: + """Convert a Kohya-format layer name to a dot-separated model module path. + + Example: lora_unet_transformer_blocks_0_attn_to_k -> transformer_blocks.0.attn.to_k + """ + m = _KOHYA_KEY_REGEX.match(kohya_layer) + if not m: + return None + + block_idx = m.group(1) + sub_module = m.group(2) + + for kohya_name, model_path in _KOHYA_MODULE_MAP: + if sub_module == kohya_name: + return f"transformer_blocks.{block_idx}.{model_path}" + + # Fallback: unknown sub-module, return None so caller can warn/skip + return None + def lora_model_from_qwen_image_state_dict( state_dict: Dict[str, torch.Tensor], alpha: float | None = None ) -> ModelPatchRaw: - """Convert a Qwen Image Edit LoRA state dict to a ModelPatchRaw. 
- - The Lightning LoRA keys are in the format: - transformer_blocks.0.attn.to_k.lora_down.weight - transformer_blocks.0.attn.to_k.lora_up.weight - transformer_blocks.0.attn.to_k.alpha + """Convert a Qwen Image LoRA state dict to a ModelPatchRaw. - These are already the correct module paths for QwenImageTransformer2DModel. + Handles three key formats: + - Diffusers/PEFT: transformer_blocks.0.attn.to_k.lora_down.weight + - With prefix: transformer.transformer_blocks.0.attn.to_k.lora_down.weight + - Kohya: lora_unet_transformer_blocks_0_attn_to_k.lora_down.weight """ + is_kohya = is_state_dict_likely_kohya_qwen_image(state_dict) + + if is_kohya: + return _convert_kohya_format(state_dict, alpha) + else: + return _convert_diffusers_format(state_dict, alpha) + + +def _convert_kohya_format( + state_dict: Dict[str, torch.Tensor], alpha: float | None +) -> ModelPatchRaw: + """Convert Kohya-format state dict. Keys are like lora_unet_transformer_blocks_0_attn_to_k.lokr_w1""" + layers: dict[str, BaseLayerPatch] = {} + + # Group by layer (split at first dot: layer_name.param_name) + grouped: dict[str, dict[str, torch.Tensor]] = {} + for key, value in state_dict.items(): + if not isinstance(key, str): + continue + layer_name, param_name = key.split(".", 1) + if layer_name not in grouped: + grouped[layer_name] = {} + grouped[layer_name][param_name] = value + + for kohya_layer, layer_dict in grouped.items(): + model_path = _convert_kohya_key(kohya_layer) + if model_path is None: + continue # Skip unrecognized layers + + layer = any_lora_layer_from_state_dict(layer_dict) + final_key = f"{QWEN_IMAGE_EDIT_LORA_TRANSFORMER_PREFIX}{model_path}" + layers[final_key] = layer + + return ModelPatchRaw(layers=layers) + + +def _convert_diffusers_format( + state_dict: Dict[str, torch.Tensor], alpha: float | None +) -> ModelPatchRaw: + """Convert Diffusers/PEFT format state dict.""" layers: dict[str, BaseLayerPatch] = {} - # Some LoRAs use a "transformer." prefix on keys (e.g. 
"transformer.transformer_blocks.0.attn.to_k") - # while the model's module paths start at "transformer_blocks.0.attn.to_k". Strip it if present. + # Some LoRAs use a "transformer." prefix on keys strip_prefixes = ["transformer."] grouped = _group_by_layer(state_dict) diff --git a/invokeai/frontend/web/public/locales/en.json b/invokeai/frontend/web/public/locales/en.json index 408caecc982..e5121b1cfa5 100644 --- a/invokeai/frontend/web/public/locales/en.json +++ b/invokeai/frontend/web/public/locales/en.json @@ -1501,6 +1501,7 @@ "noFLUXVAEModelSelected": "No VAE model selected for FLUX generation", "noCLIPEmbedModelSelected": "No CLIP Embed model selected for FLUX generation", "noQwen3EncoderModelSelected": "No Qwen3 Encoder model selected for FLUX2 Klein generation", + "noQwenImageComponentSourceSelected": "GGUF Qwen Image models require a Diffusers Component Source for VAE/encoder", "noZImageVaeSourceSelected": "No VAE source: Select VAE (FLUX) or Qwen3 Source model", "noZImageQwen3EncoderSourceSelected": "No Qwen3 Encoder source: Select Qwen3 Encoder or Qwen3 Source model", "fluxModelIncompatibleBboxWidth": "$t(parameters.invoke.fluxRequiresDimensionsToBeMultipleOf16), bbox width is {{width}}", diff --git a/invokeai/frontend/web/src/features/controlLayers/hooks/addLayerHooks.ts b/invokeai/frontend/web/src/features/controlLayers/hooks/addLayerHooks.ts index 3cd28b5f2a0..2027ff41741 100644 --- a/invokeai/frontend/web/src/features/controlLayers/hooks/addLayerHooks.ts +++ b/invokeai/frontend/web/src/features/controlLayers/hooks/addLayerHooks.ts @@ -80,11 +80,7 @@ export const selectDefaultControlAdapter = createSelector( export const getDefaultRefImageConfig = ( getState: AppGetState -): - | IPAdapterConfig - | FluxKontextReferenceImageConfig - | Flux2ReferenceImageConfig - | QwenImageReferenceImageConfig => { +): IPAdapterConfig | FluxKontextReferenceImageConfig | Flux2ReferenceImageConfig | QwenImageReferenceImageConfig => { const state = getState(); const 
mainModelConfig = selectMainModelConfig(state); diff --git a/invokeai/frontend/web/src/features/metadata/parsing.tsx b/invokeai/frontend/web/src/features/metadata/parsing.tsx index 7d1d511a3c2..4f179d6b017 100644 --- a/invokeai/frontend/web/src/features/metadata/parsing.tsx +++ b/invokeai/frontend/web/src/features/metadata/parsing.tsx @@ -13,6 +13,9 @@ import { kleinVaeModelSelected, negativePromptChanged, positivePromptChanged, + qwenImageComponentSourceSelected, + qwenImageQuantizationChanged, + qwenImageShiftChanged, refinerModelChanged, selectBase, setCfgRescaleMultiplier, @@ -677,6 +680,83 @@ const ZImageSeedVarianceRandomizePercent: SingleMetadataHandler = { }; //#endregion ZImageSeedVarianceRandomizePercent +//#region QwenImageComponentSource +const QwenImageComponentSource: SingleMetadataHandler = { + [SingleMetadataKey]: true, + type: 'QwenImageComponentSource', + parse: (metadata, _store) => { + try { + const raw = getProperty(metadata, 'qwen_image_component_source'); + if (raw === null || raw === undefined) { + return Promise.resolve(null); + } + return Promise.resolve(zModelIdentifierField.parse(raw)); + } catch { + return Promise.resolve(null); + } + }, + recall: (value, store) => { + store.dispatch(qwenImageComponentSourceSelected(value)); + }, + i18nKey: 'modelManager.qwenImageComponentSource', + LabelComponent: MetadataLabel, + ValueComponent: ({ value }: SingleMetadataValueProps) => ( + + ), +}; +//#endregion QwenImageComponentSource + +//#region QwenImageQuantization +const QwenImageQuantization: SingleMetadataHandler<'none' | 'int8' | 'nf4'> = { + [SingleMetadataKey]: true, + type: 'QwenImageQuantization', + parse: (metadata, _store) => { + try { + const raw = getProperty(metadata, 'qwen_image_quantization'); + const parsed = z.enum(['none', 'int8', 'nf4']).parse(raw); + return Promise.resolve(parsed); + } catch { + return Promise.resolve('none' as const); + } + }, + recall: (value, store) => { + 
store.dispatch(qwenImageQuantizationChanged(value)); + }, + i18nKey: 'modelManager.qwenImageQuantization', + LabelComponent: MetadataLabel, + ValueComponent: ({ value }: SingleMetadataValueProps<'none' | 'int8' | 'nf4'>) => ( + + ), +}; +//#endregion QwenImageQuantization + +//#region QwenImageShift +const QwenImageShift: SingleMetadataHandler = { + [SingleMetadataKey]: true, + type: 'QwenImageShift', + parse: (metadata, _store) => { + try { + const raw = getProperty(metadata, 'qwen_image_shift'); + if (raw === null || raw === undefined) { + return Promise.resolve(null); + } + const parsed = z.number().parse(raw); + return Promise.resolve(parsed); + } catch { + return Promise.resolve(null); + } + }, + recall: (value, store) => { + store.dispatch(qwenImageShiftChanged(value)); + }, + i18nKey: 'modelManager.qwenImageShift', + LabelComponent: MetadataLabel, + ValueComponent: ({ value }: SingleMetadataValueProps) => ( + + ), +}; +//#endregion QwenImageShift + //#region RefinerModel const RefinerModel: SingleMetadataHandler = { [SingleMetadataKey]: true, @@ -1233,6 +1313,9 @@ export const ImageMetadataHandlers = { ZImageSeedVarianceEnabled, ZImageSeedVarianceStrength, ZImageSeedVarianceRandomizePercent, + QwenImageComponentSource, + QwenImageQuantization, + QwenImageShift, LoRAs, CanvasLayers, RefImages, diff --git a/invokeai/frontend/web/src/features/nodes/types/common.ts b/invokeai/frontend/web/src/features/nodes/types/common.ts index ca1d42c5a44..10afd6e44bb 100644 --- a/invokeai/frontend/web/src/features/nodes/types/common.ts +++ b/invokeai/frontend/web/src/features/nodes/types/common.ts @@ -153,7 +153,7 @@ export const zModelVariantType = z.enum(['normal', 'inpaint', 'depth']); export const zFluxVariantType = z.enum(['dev', 'dev_fill', 'schnell']); export const zFlux2VariantType = z.enum(['klein_4b', 'klein_9b', 'klein_9b_base']); export const zZImageVariantType = z.enum(['turbo', 'zbase']); -export const zQwenImageVariantType = z.enum(['generate', 'edit']); +const 
zQwenImageVariantType = z.enum(['generate', 'edit']); export const zQwen3VariantType = z.enum(['qwen3_4b', 'qwen3_8b']); export const zAnyModelVariant = z.union([ zModelVariantType, diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts index e7c04744d4e..336766e5cea 100644 --- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildQwenImageGraph.ts @@ -15,11 +15,7 @@ import { addQwenImageLoRAs } from 'features/nodes/util/graph/generation/addQwenI import { addTextToImage } from 'features/nodes/util/graph/generation/addTextToImage'; import { addWatermarker } from 'features/nodes/util/graph/generation/addWatermarker'; import { Graph } from 'features/nodes/util/graph/generation/Graph'; -import { - getOriginalAndScaledSizesForTextToImage, - selectCanvasOutputFields, - selectPresetModifiedPrompts, -} from 'features/nodes/util/graph/graphBuilderUtils'; +import { selectCanvasOutputFields, selectPresetModifiedPrompts } from 'features/nodes/util/graph/graphBuilderUtils'; import type { GraphBuilderArg, GraphBuilderReturn, ImageOutputNodes } from 'features/nodes/util/graph/types'; import { selectActiveTab } from 'features/ui/store/uiSelectors'; import type { Invocation } from 'services/api/types'; @@ -63,13 +59,16 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise 1) + const useCfg = typeof cfg_scale === 'number' ? cfg_scale > 1 : true; + const negCond = useCfg + ? 
g.addNode({ + type: 'qwen_image_text_encoder', + id: getPrefixedId('neg_prompt'), + prompt: prompts.negative || ' ', + quantization: params.qwenImageQuantization, + }) + : null; const seed = g.addNode({ id: getPrefixedId('seed'), @@ -89,13 +88,15 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise - entity.isEnabled && - isQwenImageReferenceImageConfig(entity.config) && - entity.config.image !== null && - getGlobalReferenceImageWarnings(entity, model).length === 0 - ); + // Only collect reference images for edit-variant models. + // For txt2img (generate) models, reference images are not used even if they exist in state. + const isEditModel = 'variant' in model && model.variant === 'edit'; + const validRefImageConfigs = isEditModel + ? selectRefImagesSlice(state).entities.filter( + (entity) => + entity.isEnabled && + isQwenImageReferenceImageConfig(entity.config) && + entity.config.image !== null && + getGlobalReferenceImageWarnings(entity, model).length === 0 + ) + : []; if (validRefImageConfigs.length > 0) { const refImgCollect = g.addNode({ @@ -135,14 +140,12 @@ export const buildQwenImageGraph = async (arg: GraphBuilderArg): Promise { if (!modelSupportsRefImages) { return false; } - if (modelConfig?.base === 'qwen-image' && 'variant' in modelConfig && modelConfig.variant !== 'edit') { - return false; + if (modelConfig?.base === 'qwen-image') { + const variant = 'variant' in modelConfig ? 
modelConfig.variant : null; + if (variant !== 'edit') { + return false; + } } return true; }, [modelSupportsRefImages, modelConfig]); diff --git a/invokeai/frontend/web/src/features/queue/store/readiness.ts b/invokeai/frontend/web/src/features/queue/store/readiness.ts index 6fc0376208f..3f5a46c6381 100644 --- a/invokeai/frontend/web/src/features/queue/store/readiness.ts +++ b/invokeai/frontend/web/src/features/queue/store/readiness.ts @@ -257,6 +257,12 @@ const getReasonsWhyCannotEnqueueGenerateTab = (arg: { // FLUX.2 (Klein) extracts Qwen3 encoder and VAE from main model - no separate selections needed + if (model?.base === 'qwen-image' && model.format === 'gguf_quantized') { + if (!params.qwenImageComponentSource) { + reasons.push({ content: i18n.t('parameters.invoke.noQwenImageComponentSourceSelected') }); + } + } + if (model?.base === 'z-image') { // Check if VAE source is available (either separate VAE or Qwen3 Source) const hasVaeSource = params.zImageVaeModel !== null || params.zImageQwen3SourceModel !== null; @@ -680,6 +686,12 @@ const getReasonsWhyCannotEnqueueCanvasTab = (arg: { } } + if (model?.base === 'qwen-image' && model.format === 'gguf_quantized') { + if (!params.qwenImageComponentSource) { + reasons.push({ content: i18n.t('parameters.invoke.noQwenImageComponentSourceSelected') }); + } + } + if (model?.base === 'z-image') { // Check if VAE source is available (either separate VAE or Qwen3 Source) const hasVaeSource = params.zImageVaeModel !== null || params.zImageQwen3SourceModel !== null; diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts index a23217c3a81..b598719989a 100644 --- a/invokeai/frontend/web/src/services/api/schema.ts +++ b/invokeai/frontend/web/src/services/api/schema.ts @@ -18500,8 +18500,7 @@ export type components = { * @constant */ base: "qwen-image"; - /** @default generate */ - variant: components["schemas"]["QwenImageVariantType"]; + variant: 
components["schemas"]["QwenImageVariantType"] | null; }; /** Main_Diffusers_SD1_Config */ Main_Diffusers_SD1_Config: { @@ -19234,8 +19233,7 @@ export type components = { * @constant */ format: "gguf_quantized"; - /** @default generate */ - variant: components["schemas"]["QwenImageVariantType"]; + variant: components["schemas"]["QwenImageVariantType"] | null; }; /** * Main_GGUF_ZImage_Config @@ -22751,8 +22749,8 @@ export type components = { type: "qwen_image_conditioning_output"; }; /** - * Denoise - Qwen Image Edit - * @description Run the denoising process with a Qwen Image Edit model. + * Denoise - Qwen Image + * @description Run the denoising process with a Qwen Image model. */ QwenImageDenoiseInvocation: { /** @@ -22869,8 +22867,8 @@ export type components = { type: "qwen_image_denoise"; }; /** - * Image to Latents - Qwen Image Edit - * @description Generates latents from an image using the Qwen Image Edit VAE. + * Image to Latents - Qwen Image + * @description Generates latents from an image using the Qwen Image VAE. */ QwenImageImageToLatentsInvocation: { /** @@ -22930,8 +22928,8 @@ export type components = { type: "qwen_image_i2l"; }; /** - * Latents to Image - Qwen Image Edit - * @description Generates an image from latents using the Qwen Image Edit VAE. + * Latents to Image - Qwen Image + * @description Generates an image from latents using the Qwen Image VAE. */ QwenImageLatentsToImageInvocation: { /** @@ -22979,8 +22977,8 @@ export type components = { type: "qwen_image_l2i"; }; /** - * Apply LoRA Collection - Qwen Image Edit - * @description Applies a collection of LoRAs to a Qwen Image Edit transformer. + * Apply LoRA Collection - Qwen Image + * @description Applies a collection of LoRAs to a Qwen Image transformer. 
*/ QwenImageLoRACollectionLoader: { /** @@ -23020,8 +23018,8 @@ export type components = { type: "qwen_image_lora_collection_loader"; }; /** - * Apply LoRA - Qwen Image Edit - * @description Apply a LoRA model to a Qwen Image Edit transformer. + * Apply LoRA - Qwen Image + * @description Apply a LoRA model to a Qwen Image transformer. */ QwenImageLoRALoaderInvocation: { /** @@ -23068,7 +23066,7 @@ export type components = { }; /** * QwenImageLoRALoaderOutput - * @description Qwen Image Edit LoRA Loader Output + * @description Qwen Image LoRA Loader Output */ QwenImageLoRALoaderOutput: { /** @@ -23085,8 +23083,8 @@ export type components = { type: "qwen_image_lora_loader_output"; }; /** - * Main Model - Qwen Image Edit - * @description Loads a Qwen Image Edit model, outputting its submodels. + * Main Model - Qwen Image + * @description Loads a Qwen Image model, outputting its submodels. * * The transformer is always loaded from the main model (Diffusers or GGUF). * @@ -23121,7 +23119,7 @@ export type components = { model: components["schemas"]["ModelIdentifierField"]; /** * Component Source (Diffusers) - * @description Diffusers Qwen Image Edit model to extract the VAE and Qwen VL encoder from. Required when using a GGUF quantized transformer. Ignored when the main model is already in Diffusers format. + * @description Diffusers Qwen Image model to extract the VAE and Qwen VL encoder from. Required when using a GGUF quantized transformer. Ignored when the main model is already in Diffusers format. * @default null */ component_source?: components["schemas"]["ModelIdentifierField"] | null; @@ -23134,7 +23132,7 @@ export type components = { }; /** * QwenImageModelLoaderOutput - * @description Qwen Image Edit base model loader output. + * @description Qwen Image model loader output. 
*/ QwenImageModelLoaderOutput: { /** @@ -23160,8 +23158,8 @@ export type components = { type: "qwen_image_model_loader_output"; }; /** - * Prompt - Qwen Image Edit - * @description Encodes text and reference images for Qwen Image Edit using Qwen2.5-VL. + * Prompt - Qwen Image + * @description Encodes text and reference images for Qwen Image using Qwen2.5-VL. */ QwenImageTextEncoderInvocation: { /** diff --git a/invokeai/frontend/web/src/services/api/types.ts b/invokeai/frontend/web/src/services/api/types.ts index cfeb672d95e..b447f9debbe 100644 --- a/invokeai/frontend/web/src/services/api/types.ts +++ b/invokeai/frontend/web/src/services/api/types.ts @@ -330,10 +330,6 @@ export const isQwenImageDiffusersMainModelConfig = (config: AnyModelConfig): con return config.type === 'main' && config.base === 'qwen-image' && config.format === 'diffusers'; }; -export const isQwenImageEditMainModelConfig = (config: AnyModelConfig): config is MainModelConfig => { - return config.type === 'main' && config.base === 'qwen-image' && 'variant' in config && config.variant === 'edit'; -}; - export const isTIModelConfig = (config: AnyModelConfig): config is MainModelConfig => { return config.type === 'embedding'; }; diff --git a/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/__test_metadata__.json b/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/__test_metadata__.json new file mode 100644 index 00000000000..5a41ffed04c --- /dev/null +++ b/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/__test_metadata__.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32053abc6257adf4771405fddfdaed2b91497c7cd7b0ea6af0aa29f9e008ca2f +size 233 diff --git a/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/qwen_image_kohya_lokr_test.safetensors 
b/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/qwen_image_kohya_lokr_test.safetensors new file mode 100644 index 00000000000..6e34832a719 --- /dev/null +++ b/tests/model_identification/stripped_models/f9f3c9fa-9449-4f90-996e-ea6be6b7d233/qwen_image_kohya_lokr_test.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b3d666baf329c922be86eacd12517cf734514da91377787d2f3cbd2b1a017c0 +size 2910