From 953b91e8b7bdb42e64758dc3fe98f464ebb9c5b2 Mon Sep 17 00:00:00 2001 From: February71st Date: Tue, 8 Apr 2025 10:21:38 -0700 Subject: [PATCH 1/5] Edited preprocessing to fix an error where a mismatch in tokenisation caused punctuation and tokens attached to it to be removed regardless of settings --- octis/preprocessing/preprocessing.py | 21 +++++++++++++++------ tests/test_datasets.py | 26 +++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index 64ede3ee..d730dac1 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -1,6 +1,6 @@ import string from typing import List, Union - +import re import spacy from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split @@ -9,7 +9,7 @@ from pathlib import Path from octis.dataset.dataset import Dataset from collections import Counter - +print("Using the correct version") """ Maps the language to its corresponding spacy model """ @@ -156,11 +156,13 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) :return octis.dataset.dataset.Dataset """ docs = [line.strip() for line in open(documents_path, 'r').readlines()] + if self.lowercase: + docs = [d.lower() for d in docs] if self.num_processes is not None: # with Pool(self.num_processes) as p: # docs = p.map(self.simple_preprocessing_steps, docs) chunksize = max(1, len(docs) // (self.num_processes * 20)) - docs_list = process_map(self.simple_preprocessing_steps, docs, max_workers=self.num_processes, chunksize=chunksize) + docs = process_map(self.simple_preprocessing_steps, docs, max_workers=self.num_processes, chunksize=chunksize) else: docs = list(map(self.simple_preprocessing_steps, tqdm(docs))) if self.lowercase: @@ -174,6 +176,8 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) print("created vocab") 
print(len(vocabulary)) final_docs, final_labels, document_indexes = [], [], [] + + if labels_path is not None: if multilabel: labels = [ @@ -183,10 +187,15 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) labels = [ line.strip() for line in open(labels_path, 'r').readlines()] - + vocab = set(vocabulary) + + for i, doc, label in zip(range(len(docs)), docs, labels): - new_doc = [w for w in doc.split() if w in vocab] + + + new_doc = [w for w in doc.split() if ([rw for rw in re.findall(r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", w) if rw in vocab] or (len(w) == len(re.findall(r'[^\w]',w))))] + if len(new_doc) > self.min_doc_words: final_docs.append(new_doc) final_labels.append(label) @@ -206,7 +215,7 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) else: vocab = set(vocabulary) for i, doc in enumerate(docs): - new_doc = [w for w in doc.split() if w in vocab] + new_doc = [w for w in doc.split() if ([rw for rw in re.findall(r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", w) if rw in vocab] or (len(w) == len(re.findall(r'[^\w]',w))))] if len(new_doc) > self.min_doc_words: final_docs.append(new_doc) document_indexes.append(i) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e451f8f6..bf033025 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -54,7 +54,7 @@ def test_preprocessing_english_stops_split(data_dir): def test_preprocessing_multiprocess(data_dir): texts_path = data_dir+"/sample_texts/unprepr_docs.txt" p = Preprocessing(vocabulary=None, max_features=None, remove_punctuation=True, - lemmatize=False, num_processes=10, split=False, + lemmatize=False, split=False, min_chars=2, min_words_docs=1) dataset = p.preprocess_dataset( documents_path=texts_path, @@ -64,6 +64,30 @@ def test_preprocessing_multiprocess(data_dir): dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts") +def test_preprocessing_nothing(data_dir): + texts_path = 
data_dir+"/sample_texts/unprepr_docs.txt" + p = Preprocessing(vocabulary=None, max_features=None, remove_punctuation=False, + remove_numbers = False, + lemmatize=False, split=False, + min_chars=1, min_words_docs=0) + + unprocessed = [d.strip() for d in open(texts_path, "r").readlines() if len(d.strip()) > 0] + lens = [len(d.split()) for d in unprocessed] + + dataset = p.preprocess_dataset( + documents_path=texts_path, + ) + print(dataset.get_corpus()) + lens_pros = [len(d) for d in dataset.get_corpus()] + print(list(zip(lens,lens_pros))) + assert len(lens) == len(lens_pros) + for i in range(len(lens_pros)): + assert lens[i] == lens_pros[i] + + dataset.save(data_dir+"/sample_texts/") + dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts") + + def test_load_20ng(): data_home = get_data_home(data_home=None) cache_path = _pkl_filepath(data_home, "20NewsGroup" + ".pkz") From 2c825010d568d4a1dc9182b937f8f914d9e99b91 Mon Sep 17 00:00:00 2001 From: February71st Date: Tue, 15 Apr 2025 00:30:41 -0700 Subject: [PATCH 2/5] Cleaned up my previous code in preprocessing --- octis/preprocessing/preprocessing.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/octis/preprocessing/preprocessing.py b/octis/preprocessing/preprocessing.py index d730dac1..1c7c6caf 100644 --- a/octis/preprocessing/preprocessing.py +++ b/octis/preprocessing/preprocessing.py @@ -1,6 +1,6 @@ +import re import string from typing import List, Union -import re import spacy from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split @@ -9,7 +9,6 @@ from pathlib import Path from octis.dataset.dataset import Dataset from collections import Counter -print("Using the correct version") """ Maps the language to its corresponding spacy model """ @@ -156,8 +155,6 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) :return octis.dataset.dataset.Dataset """ docs = [line.strip() for line 
in open(documents_path, 'r').readlines()] - if self.lowercase: - docs = [d.lower() for d in docs] if self.num_processes is not None: # with Pool(self.num_processes) as p: # docs = p.map(self.simple_preprocessing_steps, docs) @@ -177,6 +174,10 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) print(len(vocabulary)) final_docs, final_labels, document_indexes = [], [], [] + def valid_word_or_punc(word): + valid_word = len([rw for rw in re.findall(r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", word) if rw in vocab]) > 0 + all_punc = len(word) == len(re.findall(r'[^\w]',word)) + return valid_word or all_punc if labels_path is not None: if multilabel: @@ -187,15 +188,11 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) labels = [ line.strip() for line in open(labels_path, 'r').readlines()] - - vocab = set(vocabulary) - + vocab = set(vocabulary) for i, doc, label in zip(range(len(docs)), docs, labels): + new_doc = [w for w in doc.split() if valid_word_or_punc(w)] - - new_doc = [w for w in doc.split() if ([rw for rw in re.findall(r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", w) if rw in vocab] or (len(w) == len(re.findall(r'[^\w]',w))))] - if len(new_doc) > self.min_doc_words: final_docs.append(new_doc) final_labels.append(label) @@ -215,7 +212,7 @@ def preprocess_dataset(self, documents_path, labels_path=None, multilabel=False) else: vocab = set(vocabulary) for i, doc in enumerate(docs): - new_doc = [w for w in doc.split() if ([rw for rw in re.findall(r"(?u)\b[\w|\-]{" + str(self.min_chars) + r",}\b", w) if rw in vocab] or (len(w) == len(re.findall(r'[^\w]',w))))] + new_doc = [w for w in doc.split() if valid_word_or_punc(w)] if len(new_doc) > self.min_doc_words: final_docs.append(new_doc) document_indexes.append(i) From 01d6e22f7cfc52005f1a87819e83a1c29e499f12 Mon Sep 17 00:00:00 2001 From: February71st Date: Tue, 15 Apr 2025 10:19:13 -0700 Subject: [PATCH 3/5] Restored num_processes=10 to 
multiprocess test in preprocessing, after accidentally removing it while creating a test for my fix. --- tests/test_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index bf033025..c7ec2c6a 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -54,7 +54,7 @@ def test_preprocessing_english_stops_split(data_dir): def test_preprocessing_multiprocess(data_dir): texts_path = data_dir+"/sample_texts/unprepr_docs.txt" p = Preprocessing(vocabulary=None, max_features=None, remove_punctuation=True, - lemmatize=False, split=False, + lemmatize=False, num_processes=10, split=False, min_chars=2, min_words_docs=1) dataset = p.preprocess_dataset( documents_path=texts_path, From 04d70426b588e69d3ff1699cd7c3af4c46182425 Mon Sep 17 00:00:00 2001 From: February71st Date: Tue, 15 Apr 2025 10:32:16 -0700 Subject: [PATCH 4/5] Edited my test for readability, added docstring. --- tests/test_datasets.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index c7ec2c6a..bf7e898e 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -64,7 +64,11 @@ def test_preprocessing_multiprocess(data_dir): dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts") -def test_preprocessing_nothing(data_dir): +def test_preprocessing_minimal(data_dir): + """ + This test is checking to make sure preprocessing does not remove tokens which the user doe not + specify should be removed. 
+ """ texts_path = data_dir+"/sample_texts/unprepr_docs.txt" p = Preprocessing(vocabulary=None, max_features=None, remove_punctuation=False, remove_numbers = False, @@ -72,17 +76,17 @@ def test_preprocessing_nothing(data_dir): min_chars=1, min_words_docs=0) unprocessed = [d.strip() for d in open(texts_path, "r").readlines() if len(d.strip()) > 0] - lens = [len(d.split()) for d in unprocessed] + raw_word_lens = [len(d.split()) for d in unprocessed] dataset = p.preprocess_dataset( documents_path=texts_path, ) print(dataset.get_corpus()) - lens_pros = [len(d) for d in dataset.get_corpus()] - print(list(zip(lens,lens_pros))) - assert len(lens) == len(lens_pros) - for i in range(len(lens_pros)): - assert lens[i] == lens_pros[i] + preprocessed_word_lens = [len(d) for d in dataset.get_corpus()] + print(list(zip(raw_word_lens,preprocessed_word_lens))) + assert len(raw_word_lens) == len(preprocessed_word_lens) + for i in range(len(preprocessed_word_lens)): + assert raw_word_lens[i] == preprocessed_word_lens[i] dataset.save(data_dir+"/sample_texts/") dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts") From 35f4d9d3dccd377eb543bfd49784ceb2bef43228 Mon Sep 17 00:00:00 2001 From: February71st Date: Tue, 15 Apr 2025 10:34:00 -0700 Subject: [PATCH 5/5] Removed unnecessary code from my test and fixed a typo in the docstring. --- tests/test_datasets.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index bf7e898e..569cfab0 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -66,7 +66,7 @@ def test_preprocessing_multiprocess(data_dir): def test_preprocessing_minimal(data_dir): """ - This test is checking to make sure preprocessing does not remove tokens which the user doe not + This test is checking to make sure preprocessing does not remove tokens which the user does not specify should be removed. 
""" texts_path = data_dir+"/sample_texts/unprepr_docs.txt" @@ -88,9 +88,6 @@ def test_preprocessing_minimal(data_dir): for i in range(len(preprocessed_word_lens)): assert raw_word_lens[i] == preprocessed_word_lens[i] - dataset.save(data_dir+"/sample_texts/") - dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts") - def test_load_20ng(): data_home = get_data_home(data_home=None)