From 953b91e8b7bdb42e64758dc3fe98f464ebb9c5b2 Mon Sep 17 00:00:00 2001
From: February71st <lbowen@g.hmc.edu>
Date: Tue, 8 Apr 2025 10:21:38 -0700
Subject: [PATCH 1/5] Edited preprocessing to fix an error where a mismatch in
 tokenisation caused punctuation and tokens attached to it to be removed
 regardless of settings

---
 octis/preprocessing/preprocessing.py | 21 +++++++++++++++------
 tests/test_datasets.py               | 26 +++++++++++++++++++++++++-
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py
index 64ede3ee..d730dac1 100644
--- a/octis/preprocessing/preprocessing.py
+++ b/octis/preprocessing/preprocessing.py
@@ -1,6 +1,6 @@
 import string
 from typing import List, Union
-
+import re
 import spacy
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split
@@ -9,7 +9,7 @@
 from pathlib import Path
 from octis.dataset.dataset import Dataset
 from collections import Counter
-
+print("Using the correct version")
 """
 Maps the language to its corresponding spacy model
 """
@@ -156,11 +156,13 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False)
         :return octis.dataset.dataset.Dataset
         """
         docs = [line.strip() for line in open(documents_path, 'r').readlines()]
+        if self.lowercase:
+            docs = [d.lower() for d in docs]
         if self.num_processes is not None:
             # with Pool(self.num_processes) as p:
             #    docs = p.map(self.simple_preprocessing_steps, docs)
             chunksize = max(1, len(docs) // (self.num_processes * 20))
-            docs_list = process_map(self.simple_preprocessing_steps, docs, max_workers=self.num_processes, chunksize=chunksize)
+            docs = process_map(self.simple_preprocessing_steps, docs, max_workers=self.num_processes, chunksize=chunksize)
         else:
             docs = list(map(self.simple_preprocessing_steps, tqdm(docs)))
         if self.lowercase:
@@ -174,6 +176,8 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False)
         print("created vocab")
         print(len(vocabulary))
         final_docs, final_labels, document_indexes = [], [], []
+
+
         if labels_path is not None:
             if multilabel:
                 labels = [
@@ -183,10 +187,15 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False)
                 labels = [
                     line.strip()
                     for line in open(labels_path, 'r').readlines()]
-
+            
             vocab = set(vocabulary)
+            
+
             for i, doc, label in zip(range(len(docs)), docs, labels):
-                new_doc = [w for w in doc.split() if w in vocab]
+
+                
+                new_doc = [w for w in doc.split() if ([rw for rw in re.findall(r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", w) if rw in vocab] or (len(w) == len(re.findall(r'[^\w]',w))))]
+                
                 if len(new_doc) > self.min_doc_words:
                     final_docs.append(new_doc)
                     final_labels.append(label)
@@ -206,7 +215,7 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False)
         else:
             vocab = set(vocabulary)
             for i, doc in enumerate(docs):
-                new_doc = [w for w in doc.split() if w in vocab]
+                new_doc = [w for w in doc.split() if ([rw for rw in re.findall(r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", w) if rw in vocab] or (len(w) == len(re.findall(r'[^\w]',w))))]
                 if len(new_doc) > self.min_doc_words:
                     final_docs.append(new_doc)
                     document_indexes.append(i)
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index e451f8f6..bf033025 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -54,7 +54,7 @@ def test_preprocessing_english_stops_split(data_dir):
 def test_preprocessing_multiprocess(data_dir):
     texts_path = data_dir+"/sample_texts/unprepr_docs.txt"
     p = Preprocessing(vocabulary=None, max_features=None, remove_punctuation=True,
-                      lemmatize=False,  num_processes=10, split=False,
+                      lemmatize=False, split=False,
                       min_chars=2, min_words_docs=1)
     dataset = p.preprocess_dataset(
         documents_path=texts_path,
@@ -64,6 +64,30 @@ def test_preprocessing_multiprocess(data_dir):
     dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts")
 
 
+def test_preprocessing_nothing(data_dir):
+    texts_path = data_dir+"/sample_texts/unprepr_docs.txt"
+    p = Preprocessing(vocabulary=None, max_features=None, remove_punctuation=False,
+                      remove_numbers = False,
+                      lemmatize=False, split=False,
+                      min_chars=1, min_words_docs=0)
+    
+    unprocessed = [d.strip() for d in open(texts_path, "r").readlines() if len(d.strip()) > 0]
+    lens = [len(d.split()) for d in unprocessed]
+
+    dataset = p.preprocess_dataset(
+        documents_path=texts_path,
+    )
+    print(dataset.get_corpus())
+    lens_pros = [len(d) for d in dataset.get_corpus()]
+    print(list(zip(lens,lens_pros)))
+    assert len(lens) == len(lens_pros)
+    for i in range(len(lens_pros)):
+        assert lens[i] == lens_pros[i]
+
+    dataset.save(data_dir+"/sample_texts/")
+    dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts")
+
+
 def test_load_20ng():
     data_home = get_data_home(data_home=None)
     cache_path = _pkl_filepath(data_home, "20NewsGroup" + ".pkz")

From 2c825010d568d4a1dc9182b937f8f914d9e99b91 Mon Sep 17 00:00:00 2001
From: February71st <lbowen@g.hmc.edu>
Date: Tue, 15 Apr 2025 00:30:41 -0700
Subject: [PATCH 2/5] Cleaned up my previous code in preprocessing

---
 octis/preprocessing/preprocessing.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py
index d730dac1..1c7c6caf 100644
--- a/octis/preprocessing/preprocessing.py
+++ b/octis/preprocessing/preprocessing.py
@@ -1,6 +1,6 @@
+import re
 import string
 from typing import List, Union
-import re
 import spacy
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split
@@ -9,7 +9,6 @@
 from pathlib import Path
 from octis.dataset.dataset import Dataset
 from collections import Counter
-print("Using the correct version")
 """
 Maps the language to its corresponding spacy model
 """
@@ -156,8 +155,6 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False)
         :return octis.dataset.dataset.Dataset
         """
         docs = [line.strip() for line in open(documents_path, 'r').readlines()]
-        if self.lowercase:
-            docs = [d.lower() for d in docs]
         if self.num_processes is not None:
             # with Pool(self.num_processes) as p:
             #    docs = p.map(self.simple_preprocessing_steps, docs)
@@ -177,6 +174,10 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False)
         print(len(vocabulary))
         final_docs, final_labels, document_indexes = [], [], []
 
+        def valid_word_or_punc(word):
+            valid_word = len([rw for rw in re.findall(r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", word) if rw in vocab]) > 0
+            all_punc = len(word) == len(re.findall(r'[^\w]',word))
+            return valid_word or all_punc
 
         if labels_path is not None:
             if multilabel:
@@ -187,15 +188,11 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False)
                 labels = [
                     line.strip()
                     for line in open(labels_path, 'r').readlines()]
-            
-            vocab = set(vocabulary)
-            
 
+            vocab = set(vocabulary)
             for i, doc, label in zip(range(len(docs)), docs, labels):
+                new_doc = [w for w in doc.split() if valid_word_or_punc(w)]
 
-                
-                new_doc = [w for w in doc.split() if ([rw for rw in re.findall(r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", w) if rw in vocab] or (len(w) == len(re.findall(r'[^\w]',w))))]
-                
                 if len(new_doc) > self.min_doc_words:
                     final_docs.append(new_doc)
                     final_labels.append(label)
@@ -215,7 +212,7 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False)
         else:
             vocab = set(vocabulary)
             for i, doc in enumerate(docs):
-                new_doc = [w for w in doc.split() if ([rw for rw in re.findall(r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", w) if rw in vocab] or (len(w) == len(re.findall(r'[^\w]',w))))]
+                new_doc = [w for w in doc.split() if valid_word_or_punc(w)]
                 if len(new_doc) > self.min_doc_words:
                     final_docs.append(new_doc)
                     document_indexes.append(i)

From 01d6e22f7cfc52005f1a87819e83a1c29e499f12 Mon Sep 17 00:00:00 2001
From: February71st <lbowen@g.hmc.edu>
Date: Tue, 15 Apr 2025 10:19:13 -0700
Subject: [PATCH 3/5] Restored num_processes=10 to multiprocess test in
 preprocessing, after accidentally removing it while creating a test for my
 fix.

---
 tests/test_datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index bf033025..c7ec2c6a 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -54,7 +54,7 @@ def test_preprocessing_english_stops_split(data_dir):
 def test_preprocessing_multiprocess(data_dir):
     texts_path = data_dir+"/sample_texts/unprepr_docs.txt"
     p = Preprocessing(vocabulary=None, max_features=None, remove_punctuation=True,
-                      lemmatize=False, split=False,
+                      lemmatize=False, num_processes=10, split=False,
                       min_chars=2, min_words_docs=1)
     dataset = p.preprocess_dataset(
         documents_path=texts_path,

From 04d70426b588e69d3ff1699cd7c3af4c46182425 Mon Sep 17 00:00:00 2001
From: February71st <lbowen@g.hmc.edu>
Date: Tue, 15 Apr 2025 10:32:16 -0700
Subject: [PATCH 4/5] Edited my test for readability, added docstring.

---
 tests/test_datasets.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index c7ec2c6a..bf7e898e 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -64,7 +64,11 @@ def test_preprocessing_multiprocess(data_dir):
     dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts")
 
 
-def test_preprocessing_nothing(data_dir):
+def test_preprocessing_minimal(data_dir):
+    """
+    This test is checking to make sure preprocessing does not remove tokens which the user doe not
+    specify should be removed.
+    """
     texts_path = data_dir+"/sample_texts/unprepr_docs.txt"
     p = Preprocessing(vocabulary=None, max_features=None, remove_punctuation=False,
                       remove_numbers = False,
@@ -72,17 +76,17 @@ def test_preprocessing_nothing(data_dir):
                       min_chars=1, min_words_docs=0)
     
     unprocessed = [d.strip() for d in open(texts_path, "r").readlines() if len(d.strip()) > 0]
-    lens = [len(d.split()) for d in unprocessed]
+    raw_word_lens = [len(d.split()) for d in unprocessed]
 
     dataset = p.preprocess_dataset(
         documents_path=texts_path,
     )
     print(dataset.get_corpus())
-    lens_pros = [len(d) for d in dataset.get_corpus()]
-    print(list(zip(lens,lens_pros)))
-    assert len(lens) == len(lens_pros)
-    for i in range(len(lens_pros)):
-        assert lens[i] == lens_pros[i]
+    preprocessed_word_lens = [len(d) for d in dataset.get_corpus()]
+    print(list(zip(raw_word_lens,preprocessed_word_lens)))
+    assert len(raw_word_lens) == len(preprocessed_word_lens)
+    for i in range(len(preprocessed_word_lens)):
+        assert raw_word_lens[i] == preprocessed_word_lens[i]
 
     dataset.save(data_dir+"/sample_texts/")
     dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts")

From 35f4d9d3dccd377eb543bfd49784ceb2bef43228 Mon Sep 17 00:00:00 2001
From: February71st <lbowen@g.hmc.edu>
Date: Tue, 15 Apr 2025 10:34:00 -0700
Subject: [PATCH 5/5] Removed unnecessary code from my test and fixed a typo
 in the docstring.

---
 tests/test_datasets.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index bf7e898e..569cfab0 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -66,7 +66,7 @@ def test_preprocessing_multiprocess(data_dir):
 
 def test_preprocessing_minimal(data_dir):
     """
-    This test is checking to make sure preprocessing does not remove tokens which the user doe not
+    This test is checking to make sure preprocessing does not remove tokens which the user does not
     specify should be removed.
     """
     texts_path = data_dir+"/sample_texts/unprepr_docs.txt"
@@ -88,9 +88,6 @@ def test_preprocessing_minimal(data_dir):
     for i in range(len(preprocessed_word_lens)):
         assert raw_word_lens[i] == preprocessed_word_lens[i]
 
-    dataset.save(data_dir+"/sample_texts/")
-    dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts")
-
 
 def test_load_20ng():
     data_home = get_data_home(data_home=None)