diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py
index 0909be33d..ab99a793c 100644
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -566,6 +566,8 @@ def __init__(
min_word_threshold: int = None,
threshold_type: str = "fixed",
threshold: float = 0.48,
+ preserve_classes: list = None,
+ preserve_tags: list = None,
):
"""
Initializes the PruningContentFilter class, if not provided, falls back to page metadata.
@@ -578,11 +580,15 @@ def __init__(
min_word_threshold (int): Minimum word threshold for filtering (optional).
threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
threshold (float): Fixed threshold value (default: 0.48).
+ preserve_classes (list): CSS class names to always keep regardless of score (optional).
+ preserve_tags (list): HTML tag names to always keep regardless of score (optional).
"""
super().__init__(None)
self.min_word_threshold = min_word_threshold
self.threshold_type = threshold_type
self.threshold = threshold
+ self.preserve_classes = set(preserve_classes) if preserve_classes else set()
+ self.preserve_tags = set(preserve_tags) if preserve_tags else set()
# Add tag importance for dynamic threshold
self.tag_importance = {
@@ -682,6 +688,16 @@ def _remove_unwanted_tags(self, soup):
for element in soup.find_all(tag):
element.decompose()
+ def _is_preserved(self, node):
+ """Check if a node matches the preserve whitelist."""
+ if self.preserve_tags and node.name in self.preserve_tags:
+ return True
+ if self.preserve_classes and "class" in getattr(node, "attrs", {}):
+ node_classes = set(node["class"]) if isinstance(node["class"], list) else {node["class"]}
+ if node_classes & self.preserve_classes:
+ return True
+ return False
+
def _prune_tree(self, node):
"""
Prunes the tree starting from the given node.
@@ -692,6 +708,10 @@ def _prune_tree(self, node):
if not node or not hasattr(node, "name") or node.name is None:
return
+ # Skip pruning for preserved nodes — always keep them
+ if self._is_preserved(node):
+ return
+
text_len = len(node.get_text(strip=True))
tag_len = len(node.encode_contents().decode("utf-8"))
link_text_len = sum(
diff --git a/tests/test_pruning_preserve_whitelist_1900.py b/tests/test_pruning_preserve_whitelist_1900.py
new file mode 100644
index 000000000..01a46817b
--- /dev/null
+++ b/tests/test_pruning_preserve_whitelist_1900.py
@@ -0,0 +1,258 @@
+"""
+Tests for #1900: PruningContentFilter preserve_classes and preserve_tags.
+
+Verifies that whitelisted classes/tags are always kept regardless of
+pruning score, while non-whitelisted content is still pruned normally.
+"""
+import pytest
+from crawl4ai.content_filter_strategy import PruningContentFilter
+
+
+# ── HTML fixtures ────────────────────────────────────────────────────────
+
+GITHUB_COMMENT_HTML = """
+<html>
+<body>
+<nav class="site-nav"><a href="/">Home</a> <a href="/docs">Docs</a></nav>
+<article>
+<h1>Discussion: Feature Request</h1>
+<p>This is a long paragraph about the feature request with enough words to
+   pass the pruning threshold easily. The feature would add support for document
+   extraction in the crawl pipeline, enabling binary documents like PDFs and
+   DOCX files to be processed alongside HTML pages.</p>
+<div class="comment">
+<div class="comment-header"><span class="author">alice</span>
+<time datetime="2026-04-06">Apr 6, 2026</time></div>
+<p>I think this is a great idea. We should implement it using a
+   pluggable strategy pattern so users can bring their own extraction
+   backend. This would keep the core library lean while supporting
+   many document types.</p>
+</div>
+<div class="comment">
+<div class="comment-header"><span class="author">bob</span>
+<time datetime="2026-04-06">Apr 6, 2026</time></div>
+<p>Agreed with alice. The abstract base class approach makes sense.
+   We could also add a built-in implementation for PDFs since crawl4ai
+   already has PDFContentScrapingStrategy that could be wrapped.</p>
+</div>
+</article>
+<footer>Copyright 2026 Example Site</footer>
+</body>
+</html>
+"""
+
+ATTRIBUTION_HTML = """
+<html>
+<body>
+<article>
+<p>Long article content that should definitely pass the threshold because it
+   contains enough words and text density to score well in the pruning algorithm.
+   This paragraph discusses the implementation details of the feature.</p>
+<div class="byline">By Jane Smith</div>
+<div class="author">Jane is a senior engineer at Example Corp.</div>
+</article>
+</body>
+</html>
+"""
+
+SIMPLE_HTML = """
+<html>
+<body>
+<p>Main content paragraph with enough text to pass pruning easily. This
+   discusses important topics that should be preserved in the output.
+   <cite>Source: Example Research Paper, 2026</cite></p>
+</body>
+</html>
+"""
+
+
+# ── Default behavior (no whitelist) ──────────────────────────────────────
+
+class TestDefaultBehavior:
+
+ def test_default_no_preserve(self):
+ """Without whitelist, default pruning behavior is unchanged."""
+ f = PruningContentFilter()
+ assert f.preserve_classes == set()
+ assert f.preserve_tags == set()
+
+ def test_main_content_kept(self):
+ """Long paragraphs should still be kept."""
+ f = PruningContentFilter()
+ result = f.filter_content(GITHUB_COMMENT_HTML)
+ combined = " ".join(result)
+ assert "feature request" in combined.lower()
+
+ def test_nav_footer_still_removed(self):
+ """Nav and footer should still be removed even with whitelist on other things."""
+ f = PruningContentFilter(preserve_classes=["author"])
+ result = f.filter_content(GITHUB_COMMENT_HTML)
+ combined = " ".join(result)
+ assert "site-nav" not in combined
+ assert "Copyright 2026" not in combined
+
+
+# ── preserve_classes ─────────────────────────────────────────────────────
+
+class TestPreserveClasses:
+
+ def test_preserve_author_class(self):
+ """Elements with 'author' class should be kept when whitelisted."""
+ f = PruningContentFilter(preserve_classes=["author"])
+ result = f.filter_content(GITHUB_COMMENT_HTML)
+ combined = " ".join(result)
+ assert "alice" in combined
+ assert "bob" in combined
+
+ def test_without_preserve_author_may_be_stripped(self):
+ """Without whitelist, short author spans may be stripped."""
+ f = PruningContentFilter()
+ result = f.filter_content(ATTRIBUTION_HTML)
+ combined = " ".join(result)
+ # The main content should be there
+ assert "article content" in combined.lower()
+ # byline might be stripped (short, low density)
+
+ def test_preserve_byline(self):
+ """Preserving 'byline' class keeps attribution."""
+ f = PruningContentFilter(preserve_classes=["byline"])
+ result = f.filter_content(ATTRIBUTION_HTML)
+ combined = " ".join(result)
+ assert "Jane Smith" in combined
+
+ def test_preserve_multiple_classes(self):
+ """Multiple classes can be preserved."""
+ f = PruningContentFilter(
+ preserve_classes=["author", "comment-header", "byline"]
+ )
+ result = f.filter_content(GITHUB_COMMENT_HTML)
+ combined = " ".join(result)
+ assert "alice" in combined
+ assert "bob" in combined
+
+ def test_preserve_class_not_in_html(self):
+ """Preserving a class that doesn't exist in the HTML is harmless."""
+ f = PruningContentFilter(preserve_classes=["nonexistent-class"])
+ result = f.filter_content(GITHUB_COMMENT_HTML)
+ # Should work normally, no crash
+ assert len(result) > 0
+
+ def test_empty_preserve_classes(self):
+ """Empty list should behave like no whitelist."""
+ f = PruningContentFilter(preserve_classes=[])
+ assert f.preserve_classes == set()
+
+
+# ── preserve_tags ────────────────────────────────────────────────────────
+
+class TestPreserveTags:
+
+ def test_preserve_cite_tag(self):
+ """Preserving 'cite' tag keeps source attribution."""
+ f = PruningContentFilter(preserve_tags=["cite"])
+ result = f.filter_content(SIMPLE_HTML)
+ combined = " ".join(result)
+ assert "Example Research Paper" in combined
+
+ def test_preserve_time_tag(self):
+ """Preserving 'time' tag keeps timestamps."""
+ f = PruningContentFilter(preserve_tags=["time"])
+ result = f.filter_content(GITHUB_COMMENT_HTML)
+ combined = " ".join(result)
+ assert "Apr 6, 2026" in combined
+
+ def test_preserve_multiple_tags(self):
+ """Multiple tags can be preserved."""
+ f = PruningContentFilter(preserve_tags=["cite", "time"])
+ result = f.filter_content(SIMPLE_HTML)
+ combined = " ".join(result)
+ assert "Example Research Paper" in combined
+
+ def test_empty_preserve_tags(self):
+ """Empty list should behave like no whitelist."""
+ f = PruningContentFilter(preserve_tags=[])
+ assert f.preserve_tags == set()
+
+
+# ── Combined ─────────────────────────────────────────────────────────────
+
+class TestCombined:
+
+ def test_both_classes_and_tags(self):
+ """Both preserve_classes and preserve_tags work together."""
+ f = PruningContentFilter(
+ preserve_classes=["author"],
+ preserve_tags=["time"],
+ )
+ result = f.filter_content(GITHUB_COMMENT_HTML)
+ combined = " ".join(result)
+ assert "alice" in combined
+ assert "Apr 6, 2026" in combined
+
+ def test_whitelist_does_not_override_excluded_tags(self):
+ """Nav/footer/header are removed before pruning — whitelist can't save them."""
+ f = PruningContentFilter(preserve_tags=["nav"])
+ result = f.filter_content(GITHUB_COMMENT_HTML)
+ combined = " ".join(result)
+ # nav is in excluded_tags and removed before pruning runs
+ # preserve_tags only affects the pruning phase
+ # This is expected — excluded_tags are structural boilerplate
+
+
+# ── _is_preserved method ─────────────────────────────────────────────────
+
+class TestIsPreserved:
+
+ def test_is_preserved_by_class(self):
+ from bs4 import BeautifulSoup
+ f = PruningContentFilter(preserve_classes=["author"])
+ soup = BeautifulSoup('Alice', "html.parser")
+ node = soup.find("span")
+ assert f._is_preserved(node) is True
+
+ def test_not_preserved_without_match(self):
+ from bs4 import BeautifulSoup
+ f = PruningContentFilter(preserve_classes=["author"])
+ soup = BeautifulSoup('2026', "html.parser")
+ node = soup.find("span")
+ assert f._is_preserved(node) is False
+
+ def test_is_preserved_by_tag(self):
+ from bs4 import BeautifulSoup
+ f = PruningContentFilter(preserve_tags=["cite"])
+ soup = BeautifulSoup('Source', "html.parser")
+ node = soup.find("cite")
+ assert f._is_preserved(node) is True
+
+ def test_not_preserved_empty_whitelist(self):
+ from bs4 import BeautifulSoup
+ f = PruningContentFilter()
+ soup = BeautifulSoup('Alice', "html.parser")
+ node = soup.find("span")
+ assert f._is_preserved(node) is False
+
+
+# ── Serialization (for Docker API) ───────────────────────────────────────
+
+class TestSerialization:
+
+ def test_params_stored(self):
+ """preserve_classes and preserve_tags should be stored as attributes."""
+ f = PruningContentFilter(
+ preserve_classes=["author", "byline"],
+ preserve_tags=["time", "cite"],
+ )
+ assert f.preserve_classes == {"author", "byline"}
+        assert f.preserve_tags == {"time", "cite"}
+