// VU-dictionary-parser.go (forked from psankar/korkai)
// Author: Sankar P <[email protected]>
// Can be used to parse the VU Dictionary at https://github.com/rprabhu/TamilDictionary
package main
import (
"bufio"
"encoding/json"
"io/ioutil"
"log"
"os"
"sort"
"strings"
)
// skippedSuffixes lists the word endings that exclude a word from the
// generated word list. Each entry is a Tamil consonant followed by a pulli
// (U+0BCD).
var skippedSuffixes = []string{"க்", "ங்", "ச்", "ஞ்", "ட்", "த்", "ந்", "ப்", "வ்", "ற்"}

// previewLimit caps how many words are echoed to the log as a sanity check.
const previewLimit = 100

// main reads every *.json file in the current directory as a flat
// string-to-string JSON object and writes TamilVUDictionary.txt, one entry
// per line:
//
//  1. the keys of each file (files in directory-listing order, keys sorted
//     within a file), followed by
//  2. the sorted, de-duplicated words found in the values, excluding words
//     that contain '+' or end in one of skippedSuffixes.
//
// Any I/O or decoding error terminates the program through log.Fatal.
func main() {
	files, err := ioutil.ReadDir(".")
	if err != nil {
		log.Fatal(err)
	}

	f, err := os.Create("TamilVUDictionary.txt")
	if err != nil {
		log.Fatal(err)
	}
	// Write errors on a bufio.Writer are sticky: once a write fails, every
	// later write and the final Flush report the same error. Checking Flush
	// at the end therefore covers all WriteString calls below.
	w := bufio.NewWriter(f)

	var words []string
	for _, jsonFile := range files {
		// A directory that happens to be named "*.json" cannot be parsed.
		if jsonFile.IsDir() || !strings.HasSuffix(jsonFile.Name(), ".json") {
			continue
		}
		log.Println("Parsing: ", jsonFile.Name())

		b, err := ioutil.ReadFile(jsonFile.Name())
		if err != nil {
			log.Fatal(err)
		}
		var data map[string]string
		if err := json.Unmarshal(b, &data); err != nil {
			log.Fatalf("%s: %v", jsonFile.Name(), err)
		}

		// Map iteration order is randomised; sort the keys so that the
		// output file is identical from run to run.
		headwords := make([]string, 0, len(data))
		for k := range data {
			headwords = append(headwords, k)
		}
		sort.Strings(headwords)

		for _, k := range headwords {
			w.WriteString(k + "\n")
			words = append(words, splitMeaning(data[k])...)
		}
	}

	uwords := uniqueWords(words)
	log.Println(preview(uwords), len(uwords))
	sort.Strings(uwords)
	log.Println(preview(uwords), len(uwords))

	for _, word := range uwords {
		w.WriteString(word + "\n")
	}

	// Deferred calls do not run when log.Fatal exits, so flush and close
	// explicitly and report any failure.
	if err := w.Flush(); err != nil {
		log.Fatal(err)
	}
	if err := f.Close(); err != nil {
		log.Fatal(err)
	}
}

// splitMeaning breaks a dictionary value into words, treating spaces and the
// punctuation characters ( ) ; . , : ' as separators.
func splitMeaning(v string) []string {
	return strings.FieldsFunc(v, func(r rune) bool {
		switch r {
		case '(', ';', ' ', '.', ')', ',', ':', '\'':
			return true
		}
		return false
	})
}

// uniqueWords returns the first occurrence of each word in words, preserving
// input order, after dropping words that contain '+' or end in one of
// skippedSuffixes.
func uniqueWords(words []string) []string {
	seen := make(map[string]struct{}, len(words))
	var out []string
	for _, word := range words {
		if strings.Contains(word, "+") {
			continue
		}
		if _, dup := seen[word]; dup {
			continue
		}
		seen[word] = struct{}{}
		if hasSkippedSuffix(word) {
			continue
		}
		out = append(out, word)
	}
	return out
}

// hasSkippedSuffix reports whether word ends in one of skippedSuffixes.
func hasSkippedSuffix(word string) bool {
	for _, suffix := range skippedSuffixes {
		if strings.HasSuffix(word, suffix) {
			return true
		}
	}
	return false
}

// preview returns at most the first previewLimit words. It never slices past
// the end of a short list.
func preview(words []string) []string {
	if len(words) > previewLimit {
		return words[:previewLimit]
	}
	return words
}