Compare commits

..

No commits in common. "main" and "6/24/25" have entirely different histories.

7 changed files with 1 additions and 269 deletions

4
.gitignore vendored
View File

@ -1,7 +1,3 @@
# Ignore large dictionary
kaikki.org-dictionary*
dictionary/
# ---> Node
# Logs
logs

View File

@ -1,24 +1,4 @@
# valDict
A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**.
valDict can be served using your choice of web server.
## Building
First, your filesystem needs the "large_dir" option enabled; "dir_index" will also help speed things up. Both can be enabled with:
```bash
tune2fs -O large_dir /dev/DEVICE
tune2fs -O dir_index /dev/DEVICE
#DEVICE can be found with lsblk
```
IN ADDITION, you need A LOT of inodes, around 10 million. I highly recommend making a DEDICATED ext4 partition JUST FOR this dictionary with an outrageously high inode count (remember to set large_dir and dir_index on the new filesystem).
Personally, I recommend a 40-45GB partition: 45,000,000,000 bytes / 10,000,000 goal inodes = 4,500 bytes per inode (waow that is crazy).
This is a decently efficient way to partition the disk while still leaving a decent amount of space for extra files or whatever future expansion may be required. Making a filesystem like this would probably require first making an ext4 partition with fdisk, then using mkfs.ext4 -i 3200 to get a ratio of 3200 bytes per inode (denser than the 4,500 computed above, which leaves spare inodes beyond the 10 million goal).
Building valDict is very easy: just clone the repo and run `./downloadDict.sh`, then run `./cleanbuild.sh`. Beware, building takes a LONGGGG time (it has to write ~20GB worth of files).
RUN AT OWN RISK BE CAREFUL CAREFUL CAREFUL
valDict can be served using the built-in React server that comes with this repo, or your choice of web server.

View File

@ -1,3 +0,0 @@
# Clean rebuild: delete any previously generated ./dictionary/ tree (verbosely),
# then regenerate it from scratch.
rm -rfv ./dictionary/
# Run the generator via Node; `gendictionary` is resolved relative to the
# current directory, so this script must be started from the repo root.
node gendictionary

View File

@ -1,21 +0,0 @@
#!/bin/sh
# Download the gzip-compressed English kaikki.org dictionary release,
# decompress it into the current directory and verify its SHA-256 checksum.
# Exits non-zero if the download, decompression or verification fails.
set -eu

DICT_FILE=kaikki.org-dictionary-English.jsonl
DICT_URL=https://gitea.apexfight.net/apex/valDict/releases/download/6%2F24%2F25/kaikki.org-dictionary-English.jsonl.gz
# correct sum
DICT_SHA256=5c8f1d50d4bf93113ffad1b2799d12757b3bf82d29ca5d2fbe1368f05befa91c

# -f: fail on HTTP errors instead of piping an error page into gzip.
# -L: follow redirects to wherever the release asset is actually served from.
# A curl failure that gzip does not notice is still caught by the checksum below.
curl -fL "$DICT_URL" | gzip -dc > "$DICT_FILE"

echo calculating sha256 sum of download...
# sha256sum -c expects "<hash>  <file>" (two spaces). Letting sha256sum do the
# comparison avoids the temporary sum files and the whitespace mismatch that
# made a plain diff of `echo` output against `sha256sum` output always differ.
if ! echo "$DICT_SHA256  $DICT_FILE" | sha256sum -c -; then
    echo "Checksum mismatch: $DICT_FILE is corrupt or incomplete." >&2
    exit 1
fi

echo Size of dictionary:
du -sh "$DICT_FILE"

View File

@ -1,111 +0,0 @@
const fs = require('fs');
const rl = require('readline')
// Build configuration.
const trie = true; // when true, words are split into fixed-size path segments (see getPath)
const trieLevel = 4; //size between cuts for trie ex 4 : "/exam/ple"
const dictPath = "./dictionary/";
const language = "en"

// Stream the JSONL dump line by line instead of loading it into memory.
const stream = fs.createReadStream("./kaikki.org-dictionary-English.jsonl", 'utf-8');
// crlfDelay: Infinity makes "\r\n" always count as a single line break.
const reader = rl.createInterface({ input: stream, crlfDelay: Infinity });

// Root output directory for this language, e.g. "./dictionary/en/".
const path = dictPath + language + "/";
fs.mkdirSync(path,{recursive:true});

let iter = 0; // number of input lines seen so far (progress logging only)

// Each line is one JSON entry; its data is appended to the JSON files under
// <path>/<getPath(word)>/<pos>/.
reader.on('line', (line) => {
    iter++;
    console.log("Iteration Number: "+iter.toString());

    // A blank line is not valid JSON and would otherwise abort the whole build.
    if (line.trim() === "") {
        return;
    }

    const entry = JSON.parse(line);
    const thispath = path + getPath(entry.word) + entry.pos + "/";
    console.log(thispath);

    // First time this word/pos pair is seen: create the directory and empty files.
    if (!fs.existsSync(thispath + "definitions.json")) {
        initializeDir(thispath);
    }

    // The writers return nothing, so their results are not captured.
    writeDefinitions(thispath, entry);
    writeSounds(thispath, entry);
    writeThesaurus(thispath, entry);
});
/**
 * Map a word to its lower-cased relative directory path, always ending in "/".
 *
 * With `trie` enabled the word is cut into consecutive segments of `trieLevel`
 * UTF-16 code units, each followed by "/" (e.g. "example" -> "exam/ple/").
 * Otherwise the path is just the word followed by "/".
 *
 * Segments are not escaped, so a "/" inside the word adds extra directory levels.
 *
 * @param {string} word - headword of the dictionary entry
 * @returns {string} relative path for the word
 */
function getPath(word){
    if (!trie) {
        return (word + '/').toLowerCase();
    }

    let segments = "";
    let start = 0;
    while (start < word.length) {
        // slice() stops at the end of the string, so the last segment may be shorter.
        segments += word.slice(start, start + trieLevel) + "/";
        start += trieLevel;
    }
    return segments.toLowerCase();
}
// Placeholder: the body is empty, so this writes nothing and returns undefined.
// NOTE(review): presumably intended to fill thesaurus.json for the entry — confirm before implementing.
function writeThesaurus(thispath, entry) {
}
/**
 * Append the sound data of `entry` to the existing sounds.json in `thispath`.
 *
 * Each element of `entry.sounds` is filed under exactly one list:
 *   - has `audio`     -> its `ogg_url` / `mp3_url` go to `audios`
 *   - has `rhymes`    -> the value goes to `rhymes`
 *   - has `homophone` -> the value goes to `homophones`
 *   - anything else   -> the whole element goes to `pronunciations`
 *
 * @param {string} thispath - directory path ending in "/", containing sounds.json
 * @param {object} entry - parsed dictionary entry; `sounds` may be absent
 * @returns {undefined}
 */
function writeSounds(thispath, entry) {
    // Check first so the file is not read and parsed when there is nothing to add.
    if (entry.sounds == null){
        console.log("No sounds key present on entry, skipping sounds def...");
        return;
    }

    const soundsFile = thispath + "sounds.json";
    const sounds = JSON.parse(fs.readFileSync(soundsFile, 'utf-8'));

    for (const ele of entry.sounds) {
        if (ele.audio != null) {
            // An audio element may lack one of the URLs; pushing undefined
            // would be serialized as null in the audios list.
            for (const url of [ele.ogg_url, ele.mp3_url]) {
                if (url != null) {
                    sounds.audios.push(url);
                }
            }
        } else if (ele.rhymes != null) {
            sounds.rhymes.push(ele.rhymes);
        } else if (ele.homophone != null) {
            sounds.homophones.push(ele.homophone);
        } else {
            sounds.pronunciations.push(ele);
        }
    }

    fs.writeFileSync(soundsFile, JSON.stringify(sounds));
}
/**
 * Append the glosses of every sense in `entry` to the existing
 * definitions.json in `thispath`.
 *
 * Each sense contributes its `glosses` value as one element of the stored
 * `glosses` list. Senses without glosses are skipped.
 *
 * @param {string} thispath - directory path ending in "/", containing definitions.json
 * @param {object} entry - parsed dictionary entry; `senses` may be absent
 * @returns {undefined}
 */
function writeDefinitions(thispath, entry) {
    const senses = entry.senses;
    // Check first so the file is not read and parsed when there is nothing to add.
    if (senses == null){
        console.log("No senses key present on entry, skipping senses def");
        return;
    }

    const definitionsFile = thispath + "definitions.json";
    const definitions = JSON.parse(fs.readFileSync(definitionsFile, 'utf-8'));

    for (const sense of senses) {
        // Pushing a missing `glosses` would be serialized as null in the list.
        if (sense.glosses != null) {
            //add glosses onto definitions list
            definitions.glosses.push(sense.glosses);
        }
    }

    fs.writeFileSync(definitionsFile, JSON.stringify(definitions));
}
/**
 * Create the directory `path` (including missing parents) and seed it with
 * the three empty JSON files an entry directory starts from.
 *
 * @param {string} path - directory path ending in "/"
 * @returns {undefined}
 */
function initializeDir(path) {
    fs.mkdirSync(path, {recursive:true});

    // File name -> initial empty structure, written in this order.
    // The "antynoms" key spelling is part of the on-disk format; keep it as is.
    const skeletons = [
        ["definitions.json", { glosses: [] }],
        ["sounds.json", { pronunciations: [], audios: [], rhymes: [], homophones: [] }],
        ["thesaurus.json", { synonyms: [], antynoms: [] }],
    ];

    for (const [fileName, emptyRecord] of skeletons) {
        fs.writeFileSync(path + fileName, JSON.stringify(emptyRecord));
    }
}

View File

@ -1,93 +0,0 @@
/*
THIS FILE IS NOT VALID JSON, ITS JUST NOTES ABOUT THE EXPECTED
STRUCTURE IN OBJECTS LOCATED IN A KAIKKI DICTIONARY
*/
//relevant object data to extract
{
"word": string,
"lang_code": string, //en, es, pt, etc...
"pos": string, //part of speech, noun, verb, etc
"senses": [ //various definitions of the word
{
"glosses": [ //actual definition
string,
]
}
],
"sounds": [
{
"tags": [
//Received Pronunciation - important
//US
],
"ipa": string
//OR
"enpr": string
},
{
"audio": string,
"ogg_url": URLSTRING
},
{
"rhymes": string
},
{
"homophone": string
},
],
}
//PERMUTE TO >
// {{some_path}}/dictionary/{{lang_code}}/{{word}}/{{pos}}/definitions.json
// {{some_path}}/dictionary/{{lang_code}}/{{word}}/{{pos}}/sounds.json
//definitions.json
{
"glosses": [ //combined list of glosses
]
}
//sounds.json
{
"pronunciations": [ //copies of pronunciation objects from sounds lists
],
"audios": [ //copies of audio objects from sounds lists
],
"rhymes": [ //collections of strings from rhyme key of rhyme object from sounds lists
],
"homophones": [ //same as rhymes but with homophones
]
}
//thesaurus.json
{
synonyms: [
],
antynoms: [
]
}

View File

@ -1,16 +0,0 @@
{
"name": "valdict",
"version": "1.0.0",
"description": "A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**.",
"repository": {
"type": "git",
"url": "https://gitea.apexfight.net/apex/valDict.git"
},
"license": "GFDL-1.3-or-later",
"author": "",
"type": "commonjs",
"main": "main.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
}
}