-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreplaceNames.py
More file actions
92 lines (84 loc) · 3.27 KB
/
replaceNames.py
File metadata and controls
92 lines (84 loc) · 3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import re
import string
from collections import Counter

import xlrd
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
def _load_word_set(path):
    """Return the set of newline-stripped lines of the text file at *path*.

    Blank lines (if any) yield an empty-string entry, matching the
    original line-by-line loading behavior.
    """
    with open(path, 'r') as f:
        return {line.rstrip() for line in f}

# Known first names — one per line.
names = _load_word_set('names.txt')
# Manually curated words to never treat as names (whitelist "patch").
patchWords = _load_word_set('patch.txt')
# Frequent English words; capitalized words matching these are not names.
commonWords = _load_word_set('commonWords.txt')
# Alternate (test/sample) workbooks kept for quick swapping during development:
#workbook = xlrd.open_workbook('fakeSessionNotes_dummyFile.xlsx')
#workbook = xlrd.open_workbook('AllData_sample.xlsx')
# Spreadsheet of session notes to be redacted; first sheet holds the data.
workbook = xlrd.open_workbook('AllData.xlsx')
worksheet = workbook.sheet_by_index(0)
# Sheet dimensions — appear unused in the rest of this file; kept for reference.
nrows = worksheet.nrows
ncols = worksheet.ncols
# Accumulates every token labeled as a name by the loop below (with repeats),
# so the summary statistics at the end can count and tally them.
namesList = []
# Bad code, hardcoding the column in which the data is present
# But this will do for now!
# Column 0, all rows: one cell per note.
notes = worksheet.col(0, start_rowx=0)
lmtr = WordNetLemmatizer()
# For every note: split into whitespace-separated tokens, decide per token
# whether it looks like a person's name, and print the note with each
# detected name replaced by the literal token '[NAME]'.
for note in notes:
    wordsInNote = note.value.split()
    namesRemoved = []
    for word in wordsInNote:
        # Only words containing a capitalized letter run (e.g. "John" inside
        # "John's") are candidates for being names; everything else must be
        # passed through unchanged.
        m = re.search(r'[A-Z][a-z]+', word)
        # group(0) is the entire match; '' marks "not a candidate".
        stemWord = m.group(0) if m else ''
        lowered = stemWord.lower()
        # Hoist the WordNet lookup — the original queried synsets three times
        # per word. wn.synsets('') is empty, so non-candidates stay cheap.
        synsets = wn.synsets(stemWord)
        # All lowercase lemma names across every sense of the candidate.
        synonyms = {l.name().lower() for syn in synsets for l in syn.lemmas()}
        # A candidate is a name when it is a known first name, or when it
        # passes every heuristic: not a common or whitelisted word, shares no
        # synonym with the common words, is at most a noun in WordNet, is not
        # an inflected verb, and is not a plural noun (whose singular form is
        # likely in the common-word list).
        #
        # BUG FIX: the original wrote `stemWord and A or B`, which parses as
        # `(stemWord and A) or B`, so the empty-candidate guard did not cover
        # the heuristic branch B. Lowercase/non-candidate tokens ('the',
        # '123') with stemWord == '' passed every heuristic vacuously and
        # were wrongly redacted as names. The guard now wraps the whole test.
        if stemWord and (
            lowered in names or
            (
                lowered not in commonWords and
                lowered not in patchWords and
                not synonyms & commonWords and
                (not synsets or all(s.pos() == 'n' for s in synsets)) and
                lmtr.lemmatize(lowered, pos='v') == lowered and
                len(lmtr.lemmatize(lowered, pos='n')) == len(stemWord)
            )
        ):
            # Suggestion: compare stemWord with word, and append the
            # punctuation to the [NAME] token.
            namesRemoved.append('[NAME]')
            namesList.append(stemWord)
        else:
            namesRemoved.append(word)
    # Re-join the redacted tokens into a sentence, separated by spaces.
    outputNote = ' '.join(namesRemoved)
    print(outputNote)
    print()
# Summary statistics over every token redacted above: total count, number of
# distinct names, and a per-name frequency tally (most common first when
# printed via Counter). The Counter import now lives in the top-of-file
# import block per PEP 8.
print('total number of words labeled as names : ', len(namesList))
print('total number of distinct words labeled as names : ', len(set(namesList)))
print(Counter(namesList))