# splitToLB/splitToLB.py at main · gapantos/splitToLB · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126


# NOTES
# Lengths of vowels and accents are not marked.
# disappearing n - at the end of a hexameter line leading to a vowel at the start of the next hexameter
# also https://en.wikipedia.org/wiki/Linear_B#CITEREFWiseman2010

# pos comparative https://archive.org/stream/linear-a-and-linear-b/a-mycenaean-iliad_djvu.txt

import numpy as np

# Module-level list initialised with two empty strings.
# NOTE(review): presumably the record of material dropped during transcription
# (the pseudo-code at the end of this file pushes to a 'delete_array') — confirm.
deleted_array = ['','']
# Module-level list initialised with two empty strings.
# NOTE(review): presumably collects the transcribed output — TODO confirm.
transcribed_array = ['','']

# Greek vowels, upper and lower case, in plain, acute (tonos) and diaeresis forms.
# Polytonic (breathing / circumflex / iota-subscript) forms are NOT included.
_GREEK_VOWELS = frozenset({
    'α', 'ά', 'ε', 'έ', 'η', 'ή', 'ι', 'ί', 'ϊ', 'ΐ', 'ο', 'ό', 'υ', 'ύ', 'ϋ', 'ΰ', 'ω', 'ώ',
    'Α', 'Ά', 'Ε', 'Έ', 'Η', 'Ή', 'Ι', 'Ί', 'Ϊ', 'Ο', 'Ό', 'Υ', 'Ύ', 'Ϋ', 'Ω', 'Ώ',
})


def is_vowel(char):
    """Return True if ``char`` is one of the Greek vowels listed in ``_GREEK_VOWELS``.

    Args:
        char: A single-character string. The empty string and strings longer
            than one character are never members of the set, so they yield False.

    Returns:
        bool: True when ``char`` is a listed vowel, otherwise False.
    """
    return char in _GREEK_VOWELS


def is_v(char):
    """Short alias for :func:`is_vowel`, kept because the original file declared this name."""
    return is_vowel(char)

# Greek consonants, upper and lower case. Final sigma ('ς') is listed explicitly:
# it is a distinct code point from 'σ' and is the form found at the end of words,
# which is exactly where the final-consonant rules look.
_GREEK_CONSONANTS = frozenset({
    'Β', 'Γ', 'Δ', 'Ζ', 'Θ', 'Κ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Π', 'Ρ', 'Σ', 'Τ', 'Φ', 'Χ', 'Ψ',
    'β', 'γ', 'δ', 'ζ', 'θ', 'κ', 'λ', 'μ', 'ν', 'ξ', 'π', 'ρ', 'σ', 'ς', 'τ', 'φ', 'χ', 'ψ',
})


def is_consonant(char):
    """Return True if ``char`` is a Greek consonant (either case, including final sigma).

    Args:
        char: A single-character string. The empty string and strings longer
            than one character are never members of the set, so they yield False.

    Returns:
        bool: True when ``char`` is a listed consonant, otherwise False.
    """
    return char in _GREEK_CONSONANTS

def removeApostrophe(word, index):
    """Hand ``word`` to ``loneC``; the apostrophe rule itself is not implemented yet.

    Design note from the author: the apostrophe should be removed if the next
    word begins with a vowel. No such check happens here, ``index`` is accepted
    but unused, and nothing is returned.
    """
    # remove apostrophe if next word begins with vowel
    loneC(word)

def dropLastC(word):
    """Drop the final consonant of ``word`` (planned rule, not yet implemented).

    Design notes from the author:
      * The major rule is to remove the final consonant from words.
      * Ksi and psi are effectively double consonants, so they are a special case.
      * Double consonants should also be dropped, but documented, as they are
        useful for other metrics.

    Raises:
        NotImplementedError: always, until the rule is written.
    """
    raise NotImplementedError("dropLastC is not implemented yet")

def tripleCstart(word):
    """Handle a word starting with three consonants (planned rule, not yet implemented).

    Design note from the author: usually drop the first consonant,
    e.g. stratos -> tratos.

    Raises:
        NotImplementedError: always, until the rule is written.
    """
    raise NotImplementedError("tripleCstart is not implemented yet")

def quadCstart(word):
    """Handle a word starting with four consonants (planned rule, not yet implemented).

    Design note from the author: this is an error case — just flag and log it.

    Raises:
        NotImplementedError: always, until the rule is written.
    """
    raise NotImplementedError("quadCstart is not implemented yet")

def nounException(word):
    """Handle nouns that are exceptions to the rules (planned, not yet implemented).

    Design notes from the author:
      * Some nouns are exceptions and should be preserved, e.g. Faistos.
      * Not statistically significant, but they need parsing — possibly build
        a table of hapax at the start (open question).

    Raises:
        NotImplementedError: always, until the rule is written.
    """
    raise NotImplementedError("nounException is not implemented yet")

def doubleC(word):
    """Apply the consonant-pair rules to ``word`` (planned, not yet implemented).

    The author's design notes are kept as comments in the body.

    Raises:
        NotImplementedError: always, until the rules are written.
    """
    # find all consonant pairs and track original location
    # for each consonant pair
    #     if consonant pair starts with m or n
    #         if NOT mn or nm
    #             drop starting m or n
    #
    #     m, n, l*, r* - at start of pair is dropped
    #     * handle with lambda()  (NOTE(review): presumably the lamdaR() stub — confirm)
    #
    #     fi-theta = pt
    #     pp = q ... tbc
    #
    #     any consonant pair left: drop second C except for
    #     (the list of exceptions was left unfinished in the original notes)
    raise NotImplementedError("doubleC is not implemented yet")


def terminal_psi_ksi(word):
    """Handle a word-final psi or ksi (planned rule, not yet implemented).

    Design note from the author: drop these as with other final consonants,
    but record the drop.

    Raises:
        NotImplementedError: always, until the rule is written.
    """
    raise NotImplementedError("terminal_psi_ksi is not implemented yet")

def loneC(word):
    """Handle a single consonant left at the end of ``word`` (planned, not yet implemented).

    Design note from the author: single consonants at the end of a word are
    due to the removal of apostrophes. These should also be flagged.

    Raises:
        NotImplementedError: always, until the rule is written.
    """
    raise NotImplementedError("loneC is not implemented yet")

def vowels(word):
    """Transcribe the vowels of ``word`` (planned rule, not yet implemented).

    Design note from the author: single vowels map directly.

    Raises:
        NotImplementedError: always, until the rule is written.
    """
    raise NotImplementedError("vowels is not implemented yet")

def lamdaR():
    """Apply the lambda / rho rules (planned, not yet implemented).

    Design notes from the author:
      * l at the start of a consonant pair is dropped.
      * lambda becomes r.
      * r at the start of a consonant pair is dropped.

    Raises:
        NotImplementedError: always, until the rules are written.
    """
    raise NotImplementedError("lamdaR is not implemented yet")

def diphongs():
    """Apply the diphthong rules (planned, not yet implemented).

    The author's design notes are kept as comments in the body.

    Raises:
        NotImplementedError: always, until the rules are written.
    """
    # handle diacritics
    #   can be either depending on metric rules - this can be handled with a
    #   diacritic in the text (when available); future development
    #
    # ou, ai,
    # au, eu: only diphthongs at the beginning
    # ei ai oi
    # ui -
    # i-vowel transcribed with j (except i)
    # ih -
    raise NotImplementedError("diphongs is not implemented yet")

def prepositions(word):
    """Handle prepositional prefixes of ``word`` (planned rule, not yet implemented).

    Design note from the author: handle the prepositions "en" and "ev" at the
    start of the word (to be confirmed).

    Raises:
        NotImplementedError: always, until the rule is written.
    """
    raise NotImplementedError("prepositions is not implemented yet")

def find_three_consonant_starts(text):
    """Locate the words in ``text`` whose first three characters are all consonants.

    ``text`` is split on whitespace; each word's position in that split is its index.

    Args:
        text: The string to scan.

    Returns:
        list[tuple[str, int]]: One ``(three_letter_prefix, word_index)`` tuple
        per matching word, in order of appearance.
    """
    return [
        (token[:3], position)
        for position, token in enumerate(text.split())
        # words shorter than three characters can never qualify
        if len(token) >= 3 and all(is_consonant(letter) for letter in token[0:3])
    ]


# NOTE: try running the rules in a different sequence to see how it impacts the final stats.

    # corpus = #get text
    # split into corpus by white space
    # for each word
        # if the first 4 or more characters are consonants then
            # move word to array called 'anomalies' and record the original index position
            # move to next word
        # else
        # handle terminal apostrophe & resulting terminal consonants

        # if tolower(normalize(word)).startswith(('ευ', 'εν'))
            # split word on pattern
            # push first word to delete_array (equivalent to dropping first letter but with a record)

        # if the first 3 characters are consonants then
            # drop the first consonant
            # note maybe some exceptions


        # handle consonant triplets?

        # handle consonant pairs.....?