diff --git a/lib/crewai/src/crewai/crew.py b/lib/crewai/src/crewai/crew.py index 00fbae78fc5..5a00e1e1842 100644 --- a/lib/crewai/src/crewai/crew.py +++ b/lib/crewai/src/crewai/crew.py @@ -1338,7 +1338,18 @@ def _prepare_tools( api = getattr(agent.llm, "api", None) supported_types = get_supported_content_types(provider, api) + # Text files are always auto-injected (inlined as text), even + # when the model does not support multimodal input. + text_prefixes = ( + "text/", + "application/json", + "application/xml", + "application/x-yaml", + ) + def is_auto_injected(content_type: str) -> bool: + if any(content_type.startswith(t) for t in text_prefixes): + return True return any(content_type.startswith(t) for t in supported_types) # Only add read_file tool if there are files that need it diff --git a/lib/crewai/src/crewai/llm.py b/lib/crewai/src/crewai/llm.py index 75b1f654689..ee333f5cb27 100644 --- a/lib/crewai/src/crewai/llm.py +++ b/lib/crewai/src/crewai/llm.py @@ -54,7 +54,10 @@ try: - from crewai_files import aformat_multimodal_content, format_multimodal_content + from crewai_files import ( + aformat_multimodal_content, + format_multimodal_content, + ) HAS_CREWAI_FILES = True except ImportError: @@ -2039,6 +2042,10 @@ def _process_message_files(self, messages: list[LLMMessage]) -> list[LLMMessage] For each message with a `files` field, formats the files into provider-specific content blocks and updates the message content. + Text files (TextFile instances or files with text/* / application/json / + application/xml / application/x-yaml content types) are always inlined + as text content, even when the model does not support multimodal input. + Args: messages: List of messages that may contain file attachments. @@ -2049,11 +2056,55 @@ def _process_message_files(self, messages: list[LLMMessage]) -> list[LLMMessage] return messages if not self.supports_multimodal(): - if any(msg.get("files") for msg in messages): + # Inline text files as text; reject non-text files + has_non_text = False + for msg in messages: + files = msg.get("files") + if not files: + continue + + text_parts: list[str] = [] + non_text_files: dict[str, Any] = {} + for name, file_input in files.items(): + if self._is_text_file(file_input): + try: + content = file_input.read_text() + text_parts.append( + f"--- Content of file '{name}' ---\n{content}" + ) + except Exception: + non_text_files[name] = file_input + else: + non_text_files[name] = file_input + + if non_text_files: + has_non_text = True + + if text_parts: + existing_content = msg.get("content", "") + inlined = "\n\n".join(text_parts) + if isinstance(existing_content, str): + msg["content"] = ( + f"{existing_content}\n\n{inlined}" + if existing_content + else inlined + ) + elif isinstance(existing_content, list): + msg["content"] = [ + *existing_content, + self.format_text_content(inlined), + ] + + if non_text_files: + msg["files"] = non_text_files + else: + msg.pop("files", None) + + if has_non_text: raise ValueError( f"Model '{self.model}' does not support multimodal input, " - "but files were provided via 'input_files'. " - "Use a vision-capable model or remove the file inputs." + "but non-text files were provided via 'input_files'. " + "Use a vision-capable model or remove the non-text file inputs." ) return messages @@ -2090,6 +2141,10 @@ async def _aprocess_message_files( For each message with a `files` field, formats the files into provider-specific content blocks and updates the message content. + Text files (TextFile instances or files with text/* / application/json / + application/xml / application/x-yaml content types) are always inlined + as text content, even when the model does not support multimodal input. + Args: messages: List of messages that may contain file attachments. @@ -2100,11 +2155,55 @@ async def _aprocess_message_files( return messages if not self.supports_multimodal(): - if any(msg.get("files") for msg in messages): + # Inline text files as text; reject non-text files + has_non_text = False + for msg in messages: + files = msg.get("files") + if not files: + continue + + text_parts: list[str] = [] + non_text_files: dict[str, Any] = {} + for name, file_input in files.items(): + if self._is_text_file(file_input): + try: + content = file_input.read_text() + text_parts.append( + f"--- Content of file '{name}' ---\n{content}" + ) + except Exception: + non_text_files[name] = file_input + else: + non_text_files[name] = file_input + + if non_text_files: + has_non_text = True + + if text_parts: + existing_content = msg.get("content", "") + inlined = "\n\n".join(text_parts) + if isinstance(existing_content, str): + msg["content"] = ( + f"{existing_content}\n\n{inlined}" + if existing_content + else inlined + ) + elif isinstance(existing_content, list): + msg["content"] = [ + *existing_content, + self.format_text_content(inlined), + ] + + if non_text_files: + msg["files"] = non_text_files + else: + msg.pop("files", None) + + if has_non_text: raise ValueError( f"Model '{self.model}' does not support multimodal input, " - "but files were provided via 'input_files'. " - "Use a vision-capable model or remove the file inputs." + "but non-text files were provided via 'input_files'. " + "Use a vision-capable model or remove the non-text file inputs." ) return messages diff --git a/lib/crewai/src/crewai/llms/base_llm.py b/lib/crewai/src/crewai/llms/base_llm.py index 6e81271e184..7a361191f65 100644 --- a/lib/crewai/src/crewai/llms/base_llm.py +++ b/lib/crewai/src/crewai/llms/base_llm.py @@ -37,7 +37,7 @@ try: - from crewai_files import format_multimodal_content + from crewai_files import TextFile, format_multimodal_content HAS_CREWAI_FILES = True except ImportError: @@ -635,6 +635,10 @@ def _process_message_files(self, messages: list[LLMMessage]) -> list[LLMMessage] For each message with a `files` field, formats the files into provider-specific content blocks and updates the message content. + Text files (TextFile instances or files with text/* / application/json / + application/xml / application/x-yaml content types) are always inlined + as text content, even when the model does not support multimodal input. + Args: messages: List of messages that may contain file attachments. @@ -644,12 +648,61 @@ def _process_message_files(self, messages: list[LLMMessage]) -> list[LLMMessage] if not HAS_CREWAI_FILES: return messages - if not self.supports_multimodal(): - if any(msg.get("files") for msg in messages): + is_multimodal = self.supports_multimodal() + + if not is_multimodal: + # Inline text files as text; reject non-text files + has_non_text = False + for msg in messages: + files = msg.get("files") + if not files: + continue + + text_parts: list[str] = [] + non_text_files: dict[str, Any] = {} + for name, file_input in files.items(): + if self._is_text_file(file_input): + try: + content = file_input.read_text() + text_parts.append( + f"--- Content of file '{name}' ---\n{content}" + ) + except Exception: + # If reading fails, fall back to tool-based access + non_text_files[name] = file_input + else: + non_text_files[name] = file_input + + if non_text_files: + has_non_text = True + + # Append inlined text content to the message + if text_parts: + existing_content = msg.get("content", "") + inlined = "\n\n".join(text_parts) + if isinstance(existing_content, str): + msg["content"] = ( + f"{existing_content}\n\n{inlined}" + if existing_content + else inlined + ) + elif isinstance(existing_content, list): + msg["content"] = [ + *existing_content, + self.format_text_content(inlined), + ] + + # Keep only non-text files (for tool-based access) + if non_text_files: + msg["files"] = non_text_files + else: + msg.pop("files", None) + + if has_non_text: raise ValueError( f"Model '{self.model}' does not support multimodal input, " - "but files were provided via 'input_files'. " - "Use a vision-capable model or remove the file inputs." + "but non-text files were provided via 'input_files'. " + "Use a vision-capable model or remove the non-text file inputs." ) return messages @@ -680,6 +733,25 @@ def _process_message_files(self, messages: list[LLMMessage]) -> list[LLMMessage] return messages + @staticmethod + def _is_text_file(file_input: Any) -> bool: + """Check whether a file input is a text file. + + Returns True for TextFile instances or files whose content_type + starts with ``text/`` or matches common text-based MIME types + (application/json, application/xml, application/x-yaml). + """ + if HAS_CREWAI_FILES and isinstance(file_input, TextFile): + return True + content_type = getattr(file_input, "content_type", "") + if content_type.startswith("text/"): + return True + return content_type in ( + "application/json", + "application/xml", + "application/x-yaml", + ) + @staticmethod def _validate_structured_output( response: str, diff --git a/lib/crewai/src/crewai/task.py b/lib/crewai/src/crewai/task.py index 38860352b03..a13982e2e69 100644 --- a/lib/crewai/src/crewai/task.py +++ b/lib/crewai/src/crewai/task.py @@ -824,7 +824,18 @@ def prompt(self) -> str: api: str | None = getattr(self.agent.llm, "api", None) supported_types = get_supported_content_types(provider, api) + # Text files are always auto-injected (inlined as text), even + # when the model does not support multimodal input. + text_prefixes = ( + "text/", + "application/json", + "application/xml", + "application/x-yaml", + ) + def is_auto_injected(content_type: str) -> bool: + if any(content_type.startswith(t) for t in text_prefixes): + return True return any(content_type.startswith(t) for t in supported_types) auto_injected_files = { diff --git a/lib/crewai/tests/llms/test_multimodal.py b/lib/crewai/tests/llms/test_multimodal.py index cde9e13d377..11e8efad331 100644 --- a/lib/crewai/tests/llms/test_multimodal.py +++ b/lib/crewai/tests/llms/test_multimodal.py @@ -338,6 +338,280 @@ def call(self, messages, tools=None, callbacks=None): assert result == {"type": "text", "text": "Hello"} +class TestTextFileInliningNonMultimodal: + """Tests for text file inlining on non-multimodal models (issue #5137). + + When a model does not support multimodal input, text files should be + inlined as plain text in the message content rather than raising a + ValueError. + """ + + # --- BaseLLM (native provider path) --- + + def test_base_text_file_inlined_on_non_multimodal(self) -> None: + """TextFile content is inlined when model is not multimodal (BaseLLM).""" + from crewai.llms.base_llm import BaseLLM + + class NonMultimodalLLM(BaseLLM): + def call(self, messages, tools=None, callbacks=None): + return "test" + + llm = NonMultimodalLLM(model="test-model") + assert llm.supports_multimodal() is False + + text_content = b"Hello from a text file!" + messages = [ + { + "role": "user", + "content": "Analyse this file", + "files": {"readme": TextFile(source=text_content)}, + } + ] + + result = llm._process_message_files(messages) + + assert "files" not in result[0] + assert "Hello from a text file!" in result[0]["content"] + assert "readme" in result[0]["content"] + + def test_base_multiple_text_files_inlined(self) -> None: + """Multiple text files are all inlined on non-multimodal model.""" + from crewai.llms.base_llm import BaseLLM + + class NonMultimodalLLM(BaseLLM): + def call(self, messages, tools=None, callbacks=None): + return "test" + + llm = NonMultimodalLLM(model="test-model") + + messages = [ + { + "role": "user", + "content": "Analyse these files", + "files": { + "file1": TextFile(source=b"Content of file 1"), + "file2": TextFile(source=b"Content of file 2"), + }, + } + ] + + result = llm._process_message_files(messages) + + assert "files" not in result[0] + assert "Content of file 1" in result[0]["content"] + assert "Content of file 2" in result[0]["content"] + + def test_base_image_file_still_rejected_on_non_multimodal(self) -> None: + """ImageFile still raises ValueError on non-multimodal model.""" + from crewai.llms.base_llm import BaseLLM + + class NonMultimodalLLM(BaseLLM): + def call(self, messages, tools=None, callbacks=None): + return "test" + + llm = NonMultimodalLLM(model="test-model") + + messages = [ + { + "role": "user", + "content": "Describe this image", + "files": {"photo": ImageFile(source=MINIMAL_PNG)}, + } + ] + + with pytest.raises(ValueError, match="non-text files"): + llm._process_message_files(messages) + + def test_base_mixed_text_and_image_rejects_but_inlines_text(self) -> None: + """Mixed text+image: text is inlined, but error is raised for image.""" + from crewai.llms.base_llm import BaseLLM + + class NonMultimodalLLM(BaseLLM): + def call(self, messages, tools=None, callbacks=None): + return "test" + + llm = NonMultimodalLLM(model="test-model") + + messages = [ + { + "role": "user", + "content": "Process these", + "files": { + "readme": TextFile(source=b"Some text content"), + "photo": ImageFile(source=MINIMAL_PNG), + }, + } + ] + + with pytest.raises(ValueError, match="non-text files"): + llm._process_message_files(messages) + + # Text file should have been inlined before the error + assert "Some text content" in messages[0]["content"] + + def test_base_no_files_no_error(self) -> None: + """Messages without files pass through unchanged.""" + from crewai.llms.base_llm import BaseLLM + + class NonMultimodalLLM(BaseLLM): + def call(self, messages, tools=None, callbacks=None): + return "test" + + llm = NonMultimodalLLM(model="test-model") + + messages = [ + {"role": "user", "content": "No files here"}, + ] + + result = llm._process_message_files(messages) + assert result[0]["content"] == "No files here" + + def test_base_text_file_with_empty_existing_content(self) -> None: + """TextFile inlined when existing content is empty string.""" + from crewai.llms.base_llm import BaseLLM + + class NonMultimodalLLM(BaseLLM): + def call(self, messages, tools=None, callbacks=None): + return "test" + + llm = NonMultimodalLLM(model="test-model") + + messages = [ + { + "role": "user", + "content": "", + "files": {"doc": TextFile(source=b"File content here")}, + } + ] + + result = llm._process_message_files(messages) + + assert "files" not in result[0] + assert "File content here" in result[0]["content"] + # Should not start with newlines when existing content is empty + assert not result[0]["content"].startswith("\n") + + # --- LiteLLM LLM class --- + + def test_litellm_text_file_inlined_on_non_multimodal(self) -> None: + """TextFile content is inlined when litellm model is not multimodal.""" + llm = LLM(model="gpt-3.5-turbo", is_litellm=True) + assert llm.supports_multimodal() is False + + messages = [ + { + "role": "user", + "content": "Analyse this file", + "files": {"readme": TextFile(source=b"Hello from litellm test")}, + } + ] + + result = llm._process_message_files(messages) + + assert "files" not in result[0] + assert "Hello from litellm test" in result[0]["content"] + + def test_litellm_image_file_rejected_on_non_multimodal(self) -> None: + """ImageFile raises ValueError on non-multimodal litellm model.""" + llm = LLM(model="gpt-3.5-turbo", is_litellm=True) + assert llm.supports_multimodal() is False + + messages = [ + { + "role": "user", + "content": "Describe this", + "files": {"photo": ImageFile(source=MINIMAL_PNG)}, + } + ] + + with pytest.raises(ValueError, match="non-text files"): + llm._process_message_files(messages) + + def test_litellm_json_file_inlined_on_non_multimodal(self) -> None: + """JSON file (application/json) is treated as text and inlined.""" + llm = LLM(model="gpt-3.5-turbo", is_litellm=True) + assert llm.supports_multimodal() is False + + json_content = b'{"key": "value"}' + messages = [ + { + "role": "user", + "content": "Parse this JSON", + "files": {"data": TextFile(source=json_content)}, + } + ] + + result = llm._process_message_files(messages) + + assert "files" not in result[0] + assert '{"key": "value"}' in result[0]["content"] + + # --- _is_text_file helper --- + + def test_is_text_file_with_text_file_instance(self) -> None: + """_is_text_file returns True for TextFile instances.""" + from crewai.llms.base_llm import BaseLLM + + assert BaseLLM._is_text_file(TextFile(source=b"hello")) is True + + def test_is_text_file_with_image_file_instance(self) -> None: + """_is_text_file returns False for ImageFile instances.""" + from crewai.llms.base_llm import BaseLLM + + assert BaseLLM._is_text_file(ImageFile(source=MINIMAL_PNG)) is False + + def test_is_text_file_with_pdf_file_instance(self) -> None: + """_is_text_file returns False for PDFFile instances.""" + from crewai.llms.base_llm import BaseLLM + + assert BaseLLM._is_text_file(PDFFile(source=MINIMAL_PDF)) is False + + def test_is_text_file_with_text_content_type(self) -> None: + """_is_text_file returns True for objects with text/* content_type.""" + from crewai.llms.base_llm import BaseLLM + + class MockFile: + content_type = "text/plain" + + assert BaseLLM._is_text_file(MockFile()) is True + + def test_is_text_file_with_json_content_type(self) -> None: + """_is_text_file returns True for application/json content_type.""" + from crewai.llms.base_llm import BaseLLM + + class MockFile: + content_type = "application/json" + + assert BaseLLM._is_text_file(MockFile()) is True + + def test_is_text_file_with_xml_content_type(self) -> None: + """_is_text_file returns True for application/xml content_type.""" + from crewai.llms.base_llm import BaseLLM + + class MockFile: + content_type = "application/xml" + + assert BaseLLM._is_text_file(MockFile()) is True + + def test_is_text_file_with_yaml_content_type(self) -> None: + """_is_text_file returns True for application/x-yaml content_type.""" + from crewai.llms.base_llm import BaseLLM + + class MockFile: + content_type = "application/x-yaml" + + assert BaseLLM._is_text_file(MockFile()) is True + + def test_is_text_file_with_image_content_type(self) -> None: + """_is_text_file returns False for image/* content_type.""" + from crewai.llms.base_llm import BaseLLM + + class MockFile: + content_type = "image/png" + + assert BaseLLM._is_text_file(MockFile()) is False + + class TestMultipleFilesFormatting: """Tests for formatting multiple files at once.""" @@ -372,4 +646,4 @@ def test_format_empty_files_dict(self) -> None: result = format_multimodal_content({}, llm.model) - assert result == [] \ No newline at end of file + assert result == []