-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplitToLB.py
More file actions
126 lines (88 loc) · 3.96 KB
/
Copy pathsplitToLB.py
File metadata and controls
126 lines (88 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# NOTES
# Lengths of vowels and accents are not marked.
# Disappearing n: at the end of a hexameter line leading into a vowel at the start of the next hexameter
# also https://en.wikipedia.org/wiki/Linear_B#CITEREFWiseman2010
# pos comparative https://archive.org/stream/linear-a-and-linear-b/a-mycenaean-iliad_djvu.txt
import unicodedata

import numpy as np
# Module-level working buffers; each starts out holding two empty strings.
# NOTE(review): presumably `deleted_array` records the segments dropped from
# words (the pipeline notes at the end of this file mention a "delete_array")
# and `transcribed_array` collects the transcribed output - confirm once the
# pipeline itself is written.
deleted_array = [""] * 2
transcribed_array = [""] * 2
# Greek vowel letters in their monotonic forms, upper and lower case.
# Built once at import time instead of on every call.
_GREEK_VOWELS = frozenset({
    'α', 'ά', 'ε', 'έ', 'η', 'ή', 'ι', 'ί', 'ϊ', 'ΐ', 'ο', 'ό', 'υ', 'ύ', 'ϋ', 'ΰ', 'ω', 'ώ',
    'Α', 'Ά', 'Ε', 'Έ', 'Η', 'Ή', 'Ι', 'Ί', 'Ϊ', 'Ο', 'Ό', 'Υ', 'Ύ', 'Ϋ', 'Ω', 'Ώ',
})


def is_vowel(char):
    """Return True if ``char`` is a Greek vowel.

    The listed monotonic vowels match directly. Any other string is
    decomposed (NFD) and stripped of combining marks before the lookup, so
    polytonic letters such as 'ἄ' or 'ᾳ', and a base vowel followed by
    combining accents, are recognised as well.

    Args:
        char: The character to classify.

    Returns:
        True when ``char`` is a Greek vowel, with or without diacritics;
        False for everything else, including the empty string, multi-letter
        strings and non-string values.
    """
    if char in _GREEK_VOWELS:
        return True
    if not isinstance(char, str):
        return False
    # Breathings, accents, diaeresis and iota subscript are all combining
    # marks after NFD decomposition; dropping them leaves the base letter.
    base = "".join(
        c for c in unicodedata.normalize("NFD", char)
        if not unicodedata.combining(c)
    )
    return base in _GREEK_VOWELS
# Greek consonant letters, upper and lower case, including the word-final
# sigma form. Built once at import time instead of on every call.
_GREEK_CONSONANTS = frozenset({
    'Β', 'Γ', 'Δ', 'Ζ', 'Θ', 'Κ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Π', 'Ρ', 'Σ', 'Τ', 'Φ', 'Χ', 'Ψ',
    'β', 'γ', 'δ', 'ζ', 'θ', 'κ', 'λ', 'μ', 'ν', 'ξ', 'π', 'ρ', 'σ', 'ς', 'τ', 'φ', 'χ', 'ψ',
})


def is_consonant(char):
    """Return True if ``char`` is a Greek consonant.

    Final sigma ('ς') counts as a consonant. Strings that do not match
    directly are decomposed (NFD) and stripped of combining marks before the
    lookup, so rho with a breathing mark ('ῥ') is recognised too.

    Args:
        char: The character to classify.

    Returns:
        True when ``char`` is a Greek consonant; False for everything else,
        including the empty string, multi-letter strings and non-string
        values.
    """
    if char in _GREEK_CONSONANTS:
        return True
    if not isinstance(char, str):
        return False
    # Drop combining marks (e.g. the rough breathing on rho) to reach the
    # base letter.
    base = "".join(
        c for c in unicodedata.normalize("NFD", char)
        if not unicodedata.combining(c)
    )
    return base in _GREEK_CONSONANTS
def removeApostrophe(word, index):
    """Handle a word's apostrophe (work in progress).

    The intended rule, per the original note, is to remove the apostrophe
    when the next word begins with a vowel. At present the function only
    passes ``word`` to ``loneC``; ``index`` is not used and nothing is
    returned.

    Args:
        word: The word being processed.
        index: Currently unused. NOTE(review): presumably the word's position
            in the corpus, needed to look at the following word - confirm.
    """
    loneC(word)
def dropLastC(word):
    """Drop the final consonant of ``word`` (planned, not yet implemented).

    Design notes:

    * The major rule is to remove the final consonant from words.
    * Ksi and psi are effectively double consonants.
    * Double consonants should also be dropped, but the drop should be
      recorded because it is useful for other metrics.

    Args:
        word: The word to process.

    Raises:
        NotImplementedError: Always, until the rule is implemented.
    """
    raise NotImplementedError("dropLastC is not implemented yet")
def tripleCstart(word):
    """Handle a word starting with three consonants (planned, not yet implemented).

    Design note: usually drop the first consonant, e.g. stratos -> tratos.

    Args:
        word: The word to process.

    Raises:
        NotImplementedError: Always, until the rule is implemented.
    """
    raise NotImplementedError("tripleCstart is not implemented yet")
def quadCstart(word):
    """Handle a word starting with four consonants (planned, not yet implemented).

    Design note: treat this as an error; just flag and log it.

    Args:
        word: The word to process.

    Raises:
        NotImplementedError: Always, until the rule is implemented.
    """
    raise NotImplementedError("quadCstart is not implemented yet")
def nounException(word):
    """Handle nouns exempt from the normal rules (planned, not yet implemented).

    Design notes:

    * Some nouns are exceptions and should be preserved, e.g. Faistos.
    * They are not statistically significant but still need parsing.
    * Open question: build a table of hapax at the start?

    Args:
        word: The word to process.

    Raises:
        NotImplementedError: Always, until the rule is implemented.
    """
    raise NotImplementedError("nounException is not implemented yet")
def doubleC(word):
    """Resolve consonant pairs inside ``word`` (planned, not yet implemented).

    Design notes:

    * Find all consonant pairs and track their original location.
    * For each consonant pair: if the pair starts with m or n and is NOT
      mn or nm, drop the starting m or n.
    * m, n, l*, r* at the start of a pair are dropped
      (* handle with lambda()).
    * fi-theta = pt.
    * pp = q ... tbc.
    * For any consonant pair left, drop the second consonant, except for
      (the list of exceptions was left unfinished in the notes).

    Args:
        word: The word to process.

    Raises:
        NotImplementedError: Always, until the rule is implemented.
    """
    raise NotImplementedError("doubleC is not implemented yet")
def terminal_psi_ksi(word):
    """Handle a word-final psi or ksi (planned, not yet implemented).

    Design note: drop these as with other final consonants, but record
    the drop.

    Args:
        word: The word to process.

    Raises:
        NotImplementedError: Always, until the rule is implemented.
    """
    raise NotImplementedError("terminal_psi_ksi is not implemented yet")
def loneC(word):
    """Handle a single consonant left at the end of a word (planned, not yet implemented).

    Design note: single consonants at the end of a word are due to the
    removal of apostrophes. They should also be flagged.

    Args:
        word: The word to process.

    Raises:
        NotImplementedError: Always, until the rule is implemented.
    """
    raise NotImplementedError("loneC is not implemented yet")
def vowels(word):
    """Transcribe the vowels of ``word`` (planned, not yet implemented).

    Design note: single vowels map directly.

    Args:
        word: The word to process.

    Raises:
        NotImplementedError: Always, until the rule is implemented.
    """
    raise NotImplementedError("vowels is not implemented yet")
def lamdaR():
    """Apply the lambda / rho rules (planned, not yet implemented).

    Design notes:

    * l at the start of a consonant pair is dropped.
    * lambda becomes r.
    * r at the start of a consonant pair is dropped.

    Raises:
        NotImplementedError: Always, until the rule is implemented.
    """
    raise NotImplementedError("lamdaR is not implemented yet")
def diphongs():
    """Handle diphthongs (planned, not yet implemented).

    Design notes:

    * Handle diacritics.
    * A sequence can be read either way depending on metric rules; this can
      be handled with a diacritic in the text (when available) - future
      development.
    * ou, ai.
    * au, eu: only diphthongs at the beginning.
    * ei, ai, oi.
    * ui - (no rule noted yet).
    * i + vowel is transcribed with j (except i).
    * ih - (no rule noted yet).

    Raises:
        NotImplementedError: Always, until the rule is implemented.
    """
    raise NotImplementedError("diphongs is not implemented yet")
def prepositions(word):
    """Handle prepositional prefixes (planned, not yet implemented).

    Design note: handle the prepositions en and ev at the start of the
    word. To be confirmed.

    Args:
        word: The word to process.

    Raises:
        NotImplementedError: Always, until the rule is implemented.
    """
    raise NotImplementedError("prepositions is not implemented yet")
def find_three_consonant_starts(text, *, cluster_len=3):
    """Find words in ``text`` that begin with a run of consonants.

    ``text`` is split on whitespace. A word is reported when it has at least
    ``cluster_len`` characters and each of its first ``cluster_len``
    characters satisfies ``is_consonant``. Words with a longer leading run
    are reported too.

    Args:
        text: The text to scan.
        cluster_len: Number of leading consonants required. Defaults to 3;
            pass 4 to find the longer clusters that the pipeline notes treat
            as anomalies.

    Returns:
        A list of ``(prefix, index)`` tuples in word order, where ``prefix``
        is the word's first ``cluster_len`` characters and ``index`` is the
        word's position in the whitespace-split text.

    Raises:
        ValueError: If ``cluster_len`` is less than 1.
    """
    if cluster_len < 1:
        # A zero-length prefix would match every word, which is never wanted.
        raise ValueError("cluster_len must be at least 1")
    return [
        (word[:cluster_len], index)
        for index, word in enumerate(text.split())
        if len(word) >= cluster_len
        and all(is_consonant(char) for char in word[:cluster_len])
    ]
# Note: try running the steps in a different sequence to see how it impacts the final stats.
# corpus = # get text
# split the corpus into words on whitespace
# for each word
# if the first 4 or more characters are consonants then
# move the word to an array called 'anomalies' and record the original index position
# move to the next word
# else
# handle terminal apostrophe & resulting terminal consonants
# if tolower(normalize(word)).startswith(('ευ', 'εν'))
# split word on pattern
# push first word to delete_array (equivalent to dropping first letter but with a record)
# if the first 3 characters are consonants then
# drop the first consonant
# note: maybe some exceptions
# handle consonant triplets?
# handle consonant pairs.....?