Compare commits

..

6 Commits

6 changed files with 228 additions and 0 deletions

4
.gitignore vendored
View File

@ -1,3 +1,7 @@
# Ignore large dictionary
kaikki.org-dictionary*
dictionary/
# ---> Node
# Logs
logs

3
cleanbuild.sh Executable file
View File

@ -0,0 +1,3 @@
#!/bin/sh
# Rebuild the generated dictionary from scratch: delete the ./dictionary/
# output tree, then regenerate it by running gendictionary with node.
# Abort on the first failing command so a failed cleanup never leads to a
# build on top of stale output.
set -eu

rm -rf ./dictionary/
node gendictionary

21
downloadDict.sh Executable file
View File

@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Download the gzipped English Kaikki dictionary dump, decompress it next to
# this script, and verify it against the known SHA-256 checksum.
# Exits non-zero if the download, the decompression or the verification fails.
set -euo pipefail

DICT_FILE="kaikki.org-dictionary-English.jsonl"
DICT_URL="https://gitea.apexfight.net/apex/valDict/releases/download/6%2F24%2F25/kaikki.org-dictionary-English.jsonl.gz"
# correct sum
EXPECTED_SHA256="5c8f1d50d4bf93113ffad1b2799d12757b3bf82d29ca5d2fbe1368f05befa91c"

# -f: treat HTTP errors as failures instead of piping an error page into gzip.
# -L: follow redirects. pipefail (above) makes a curl failure fail the pipeline.
curl -fL "$DICT_URL" | gzip -dc > "$DICT_FILE"

echo "calculating sha256 sum of download..."
# sha256sum -c expects "<hash>  <file>" (two spaces) and exits non-zero on a
# mismatch, which (with set -e) stops the script before reporting success.
printf '%s  %s\n' "$EXPECTED_SHA256" "$DICT_FILE" | sha256sum -c -

echo "Size of dictionary:"
du -sh "$DICT_FILE"

91
gendictionary.js Normal file
View File

@ -0,0 +1,91 @@
const fs = require('fs');
const rl = require('readline')
// Root of the generated tree and the language sub-directory entries go into.
const dictPath = "./dictionary/";
const language = "en";

// Stream the JSONL dump line by line instead of loading the whole file.
const stream = fs.createReadStream("./kaikki.org-dictionary-English.jsonl", 'utf-8');
const reader = rl.createInterface(stream);

const path = dictPath + language + "/";
fs.mkdirSync(path, { recursive: true });

// Number of lines seen so far (used for progress output only).
let iter = 0;

// Each line holds one JSON entry; its data is merged into
// <path>/<word>/<pos>/{definitions,sounds,thesaurus}.json.
reader.on('line', (line) => {
  iter++;
  console.log("Iteration Number: " + iter.toString());

  // A blank line is not valid JSON; skip it instead of letting JSON.parse throw.
  if (line.trim() === "") {
    return;
  }

  const entry = JSON.parse(line);

  // Without both keys the target would be a literal "undefined" directory.
  if (entry.word == null || entry.pos == null) {
    console.log("Entry is missing word or pos, skipping entry...");
    return;
  }

  // NOTE(review): entry.word is used verbatim as a directory name. A word that
  // contains "/" or equals "." / ".." resolves to a different directory than
  // intended — decide on an escaping scheme before serving this tree.
  const thispath = path + entry.word + "/" + entry.pos + "/";
  console.log(thispath);

  if (!fs.existsSync(thispath)) {
    initializeDir(thispath);
  }

  // console.log(entry);
  // The writers are async functions: observe their promises so a failure is
  // reported together with the entry it belongs to rather than left floating.
  Promise.all([
    writeDefinitions(thispath, entry),
    writeSounds(thispath, entry),
    writeThesaurus(thispath, entry),
  ]).catch((err) => {
    console.error("Failed to write entry at " + thispath, err);
    process.exitCode = 1;
  });
});
/**
 * Placeholder for thesaurus extraction; currently performs no work.
 *
 * @param {string} thispath - Entry directory path (unused for now).
 * @param {object} entry - Parsed dictionary entry (unused for now).
 * @returns {Promise<void>} Resolves immediately with no value.
 */
async function writeThesaurus(thispath, entry) {
  // Not implemented yet: nothing is read or written here.
}
/**
 * Merge the sound data of one entry into `<thispath>sounds.json`.
 *
 * Each element of `entry.sounds` is sorted into exactly one bucket:
 *  - has `audio`     -> its `ogg_url` / `mp3_url` (only those present) go to `audios`
 *  - has `rhymes`    -> the rhymes value goes to `rhymes`
 *  - has `homophone` -> the homophone value goes to `homophones`
 *  - anything else   -> the whole element goes to `pronunciations`
 *
 * @param {string} thispath - Entry directory, ending with a path separator.
 * @param {object} entry - Parsed dictionary entry; `entry.sounds` may be absent.
 * @returns {Promise<void>} Rejects if sounds.json cannot be read, parsed or written.
 */
async function writeSounds(thispath, entry) {
  // Check first so an entry with nothing to add never touches the disk.
  if (entry.sounds == null) {
    console.log("No sounds key present on entry, skipping sounds def...");
    return;
  }

  const soundsFile = thispath + "sounds.json";
  const sounds = JSON.parse(fs.readFileSync(soundsFile, 'utf-8'));

  for (const ele of entry.sounds) {
    if (ele.audio != null) {
      // An audio element may lack one of the URLs; pushing undefined would be
      // serialised as null in the output file.
      for (const url of [ele.ogg_url, ele.mp3_url]) {
        if (url != null) {
          sounds.audios.push(url);
        }
      }
    } else if (ele.rhymes != null) {
      sounds.rhymes.push(ele.rhymes);
    } else if (ele.homophone != null) {
      sounds.homophones.push(ele.homophone);
    } else {
      sounds.pronunciations.push(ele);
    }
  }

  fs.writeFileSync(soundsFile, JSON.stringify(sounds));
}
/**
 * Merge the glosses of one entry into `<thispath>definitions.json`.
 *
 * For every sense that has a `glosses` value, that value is appended as one
 * element of the file's `glosses` list. Senses without glosses are skipped.
 *
 * @param {string} thispath - Entry directory, ending with a path separator.
 * @param {object} entry - Parsed dictionary entry; `entry.senses` may be absent.
 * @returns {Promise<void>} Rejects if definitions.json cannot be read, parsed or written.
 */
async function writeDefinitions(thispath, entry) {
  const senses = entry.senses;
  // Check first so an entry with nothing to add never touches the disk.
  if (senses == null) {
    console.log("No senses key present on entry, skipping senses def");
    return;
  }

  const definitionsFile = thispath + "definitions.json";
  const definitions = JSON.parse(fs.readFileSync(definitionsFile, 'utf-8'));

  for (const sense of senses) {
    // A sense without glosses would otherwise be serialised as null.
    if (sense.glosses != null) {
      // add glosses onto definitions list
      definitions.glosses.push(sense.glosses);
    }
  }

  fs.writeFileSync(definitionsFile, JSON.stringify(definitions));
}
/**
 * Create an entry directory (including missing parents) and seed it with the
 * three empty data files: definitions.json, sounds.json and thesaurus.json.
 *
 * @param {string} path - Directory to create, ending with a path separator
 *   (file names are appended to it directly).
 */
function initializeDir(path) {
  fs.mkdirSync(path, { recursive: true });

  // File name paired with the empty skeleton written into it, in write order.
  const seedFiles = [
    ["definitions.json", { glosses: [] }],
    ["sounds.json", { pronunciations: [], audios: [], rhymes: [], homophones: [] }],
    ["thesaurus.json", { synonyms: [], antynoms: [] }],
  ];

  for (const [fileName, skeleton] of seedFiles) {
    fs.writeFileSync(path + fileName, JSON.stringify(skeleton));
  }
}

93
notes.jsonc Normal file
View File

@ -0,0 +1,93 @@
/*
THIS FILE IS NOT VALID JSON, IT'S JUST NOTES ABOUT THE EXPECTED
STRUCTURE OF OBJECTS LOCATED IN A KAIKKI DICTIONARY
*/
//relevant object data to extract
{
"word": string,
"lang_code": string, //en, es, pt, etc...
"pos": string, //part of speech, noun, verb, etc
"senses": [ //various definitions of the word
{
"glosses": [ //actual definition
string,
]
}
],
"sounds": [
{
"tags": [
//Received Pronunciation - important
//US
],
"ipa": string
//OR
"enpr": string
},
{
"audio": string,
"ogg_url": URLSTRING
},
{
"rhymes": string
},
{
"homophone": string
},
],
}
//PERMUTE TO >
// {{some_path}}/dictionary/{{lang_code}}/{{word}}/{{pos}}/definitions.json
// {{some_path}}/dictionary/{{lang_code}}/{{word}}/{{pos}}/sounds.json
//definitions.json
{
"glosses": [ //combined list of glosses
]
}
//sounds.json
{
"pronunciations": [ //copies of pronunciation objects from sounds lists
],
"audios": [ //copies of audio objects from sounds lists
],
"rhymes": [ //collections of strings from rhyme key of rhyme object from sounds lists
],
"homophones": [ //same as rhymes but with homophones
]
}
//thesaurus.json
{
synonyms: [
],
antynoms: [
]
}

16
package.json Normal file
View File

@ -0,0 +1,16 @@
{
"name": "valdict",
"version": "1.0.0",
"description": "A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**.",
"repository": {
"type": "git",
"url": "https://gitea.apexfight.net/apex/valDict.git"
},
"license": "GFDL-1.3-or-later",
"author": "",
"type": "commonjs",
"main": "main.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
}
}