Compare commits

..

22 Commits

Author SHA1 Message Date
cd4838f692 Update README.md 2025-06-29 04:21:52 +00:00
25612ab51c Update README.md 2025-06-28 22:32:40 +00:00
9df1547fce fixed a edge case bug where existence check would succed even when it shouldnt when using the trie 2025-06-28 13:15:10 -04:00
6efed770d0 optimized trie and added optional configuration :3 2025-06-28 12:53:03 -04:00
5f485ac73a Update README.md 2025-06-28 16:08:37 +00:00
6a58b8f48d Update README.md 2025-06-28 07:36:46 +00:00
9538f78198 Update README.md 2025-06-28 07:32:55 +00:00
4059d6d868 Update README.md 2025-06-28 07:10:02 +00:00
1a2358387f Update README.md 2025-06-28 07:01:08 +00:00
b4d5966550 path splitting 2025-06-28 02:31:30 -04:00
008126c5f4 Update README.md 2025-06-28 03:43:14 +00:00
7021ab967c made cleanbuild more verbose 2025-06-25 05:12:09 -04:00
dcb9ae2f68 optimized compiler for GC 2025-06-25 04:32:13 -04:00
e5f1227948 Update README.md 2025-06-25 07:04:10 +00:00
3074606ea5 Update README.md 2025-06-25 06:46:17 +00:00
c5b4b15559 Update README.md 2025-06-25 06:44:25 +00:00
47015d8f40 cleanbuild dictionary script 2025-06-25 02:17:30 -04:00
801c17f4c2 npm manifest 2025-06-25 02:16:35 -04:00
c0a73acd03 notes documenting input and output objects of gendictionary compiler 2025-06-25 02:16:26 -04:00
a7e6424882 code to compile a jsonl file to a complete FS 2025-06-25 02:16:02 -04:00
afabae41d6 dict download scripts w checksum 2025-06-25 02:15:37 -04:00
396b666715 added exclusions to compiled dictionaries 2025-06-25 02:15:14 -04:00
7 changed files with 269 additions and 1 deletions

4
.gitignore vendored
View File

@ -1,3 +1,7 @@
# Ignore large dictionary
kaikki.org-dictionary*
dictionary/
# ---> Node # ---> Node
# Logs # Logs
logs logs

View File

@ -1,4 +1,24 @@
# valDict # valDict
A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**. A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**.
valDict can be served using the built in react server that comes with this repo, or your choice of web server. valDict can be served using your choice of web server.
## Building
First, you need the "large_dir" option enabled on your filesystem; "dir_index" will also help speed things up. Both can be enabled with:
```bash
tune2fs -O large_dir /dev/DEVICE
tune2fs -O dir_index /dev/DEVICE
#DEVICE can be found with lsblk
```
IN ADDITION, you need A LOT of inodes (around 10 million). I highly recommend making a DEDICATED ext4 partition JUST FOR this dictionary with an outrageous inode count (remember to enable large_dir and dir_index on the new filesystem).
Personally I recommend a 40-45GB partition; that is 45,000,000,000 bytes / 10,000,000 goal inodes = 4,500 bytes per inode (waow that is crazy).
This is a decently efficient way to format the partition while still leaving a very decent amount of space for extra files, or whatever future expansion may be required. Making a filesystem like this would probably require first making an ext4 partition with fdisk, then using mkfs.ext4 -i 3200 to achieve the 3200 bytes per inode ratio...
Building valDict is very easy: just clone the repo and run `./downloadDict.sh`, then run `./cleanbuild.sh` (beware, building takes a LONGGGG time... it has to write ~20GB worth of files).
RUN AT OWN RISK BE CAREFUL CAREFUL CAREFUL

3
cleanbuild.sh Executable file
View File

@ -0,0 +1,3 @@
#!/bin/sh
# Rebuild the compiled dictionary from scratch: delete any previous
# ./dictionary/ output, then run the gendictionary compiler.
# Paths are relative, so run this from the repository root.

# Stop if the old output cannot be removed, so the compiler never appends to
# a partially deleted tree.
set -e

rm -rfv ./dictionary/
node gendictionary

21
downloadDict.sh Executable file
View File

@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Download the pinned Kaikki English dictionary dump, decompress it, and verify
# its SHA-256 checksum. Exits non-zero if the download, the decompression, or
# the checksum verification fails.
set -euo pipefail

DICT_URL='https://gitea.apexfight.net/apex/valDict/releases/download/6%2F24%2F25/kaikki.org-dictionary-English.jsonl.gz'
DICT_FILE='kaikki.org-dictionary-English.jsonl'
# correct sum
DICT_SHA256='5c8f1d50d4bf93113ffad1b2799d12757b3bf82d29ca5d2fbe1368f05befa91c'

# -f: fail on HTTP errors instead of piping an error page into gzip.
# -L: follow redirects.
curl -fL "$DICT_URL" | gzip -dc > "$DICT_FILE"

echo calculating sha256 sum of download...
# sha256sum -c expects "<hash><two spaces><file>". Comparing a hand-written
# line against sha256sum output with diff fails on the spacing alone, so let
# sha256sum do the comparison itself.
if ! printf '%s  %s\n' "$DICT_SHA256" "$DICT_FILE" | sha256sum -c -; then
    echo "Checksum mismatch for $DICT_FILE - do not build from this file." >&2
    exit 1
fi

echo Size of dictionary:
du -sh "$DICT_FILE"

111
gendictionary.js Normal file
View File

@ -0,0 +1,111 @@
const fs = require('fs');
const rl = require('readline')
// ---- Build configuration ----
const trie = true; // when true, getPath splits each word into fixed-size directory segments
const trieLevel = 4; //size between cuts for trie ex 4 : "/exam/ple"
const dictPath = "./dictionary/";
const language = "en";

// The dictionary dump is read line by line (one JSON object per line).
let stream = fs.createReadStream("./kaikki.org-dictionary-English.jsonl", 'utf-8');
let reader = rl.createInterface(stream);

// Root output directory for this language, e.g. "./dictionary/en/".
const path = dictPath + language + "/";
fs.mkdirSync(path,{recursive:true});

var iter = 0; // number of entries processed so far, for progress logging

reader.on('line', (line) => {
    // A blank line is not a JSON document; skip it instead of letting
    // JSON.parse throw and abort the whole build.
    if (line.trim() === "") {
        return;
    }

    iter++;
    console.log("Iteration Number: "+iter.toString());

    const entry = JSON.parse(line);
    // One output directory per (word, part of speech) pair.
    const thispath = path + getPath(entry.word) + entry.pos + "/";
    console.log(thispath);

    // Check for the file rather than the directory: with the trie layout the
    // directory of one word can already exist as the parent of a longer word.
    if (!fs.existsSync(thispath + "definitions.json")) {
        initializeDir(thispath);
    }

    // The writers update the files in place; their return values are not used.
    writeDefinitions(thispath, entry);
    writeSounds(thispath, entry);
    writeThesaurus(thispath, entry);
});
/**
 * Build the relative directory path (with a trailing "/") under which a word
 * is stored.
 *
 * In trie mode the word is cut into consecutive chunks of `level` UTF-16 code
 * units and every chunk becomes one directory level, e.g. with level 4
 * "example" -> "exam/ple/". Otherwise the whole word is a single directory.
 * The result is lower-cased. Characters are copied as-is, so a "/" inside the
 * word also acts as a path separator.
 *
 * @param {string} word - Dictionary headword.
 * @param {boolean} [useTrie=trie] - Split the word into chunks; defaults to
 *   the module-level `trie` setting.
 * @param {number} [level=trieLevel] - Chunk size; defaults to the module-level
 *   `trieLevel` setting. Must be a positive integer in trie mode.
 * @returns {string} Lower-cased relative path ending in "/" (empty string for
 *   an empty word in trie mode).
 * @throws {RangeError} If trie mode is on and `level` is not a positive integer.
 */
function getPath(word, useTrie = trie, level = trieLevel) {
    if (!useTrie) {
        return (word + '/').toLowerCase();
    }

    // A zero or negative step would never advance the loop below.
    if (!Number.isInteger(level) || level < 1) {
        throw new RangeError("trie level must be a positive integer, got " + level);
    }

    let path = "";
    for (let i = 0; i < word.length; i += level) {
        // slice() clamps at the end of the word, so the last chunk may be shorter.
        path += word.slice(i, i + level) + "/";
    }
    return path.toLowerCase();
}
/**
 * Placeholder for thesaurus output. The body is empty: nothing is read or
 * written, and the call returns undefined.
 *
 * @param thispath - Currently unused.
 * @param entry - Currently unused.
 */
function writeThesaurus(thispath, entry) {
}
/**
 * Merge the `sounds` list of one dictionary entry into `<thispath>sounds.json`.
 *
 * Each element of `entry.sounds` is filed into exactly one bucket, checked in
 * this order:
 *   - has `audio`     -> its `ogg_url` / `mp3_url` values are appended to `audios`
 *   - has `rhymes`    -> the `rhymes` value is appended to `rhymes`
 *   - has `homophone` -> the `homophone` value is appended to `homophones`
 *   - anything else   -> the whole element is appended to `pronunciations`
 *
 * @param {string} thispath - Directory path ending in "/"; sounds.json must
 *   already exist there whenever the entry has sounds.
 * @param {object} entry - Parsed dictionary entry; `entry.sounds` may be absent.
 * @returns {void}
 */
function writeSounds(thispath, entry) {
    // Check before touching the disk: an entry without sounds needs no read.
    if (entry.sounds == null) {
        console.log("No sounds key present on entry, skipping sounds def...");
        return;
    }

    const soundsFile = thispath + "sounds.json";
    const sounds = JSON.parse(fs.readFileSync(soundsFile, 'utf-8'));

    for (const ele of entry.sounds) {
        if (ele.audio != null) {
            // An audio element may carry only one of the two URLs; pushing the
            // missing one would be serialized as `null` in the JSON array.
            for (const url of [ele.ogg_url, ele.mp3_url]) {
                if (url != null) {
                    sounds.audios.push(url);
                }
            }
        } else if (ele.rhymes != null) {
            sounds.rhymes.push(ele.rhymes);
        } else if (ele.homophone != null) {
            sounds.homophones.push(ele.homophone);
        } else {
            sounds.pronunciations.push(ele);
        }
    }

    fs.writeFileSync(soundsFile, JSON.stringify(sounds));
}
/**
 * Append the glosses of one dictionary entry to `<thispath>definitions.json`.
 *
 * Every sense contributes its `glosses` value as ONE element of the stored
 * `glosses` list, so the stored list has one item per sense. Senses without a
 * `glosses` key are skipped.
 *
 * @param {string} thispath - Directory path ending in "/"; definitions.json
 *   must already exist there whenever the entry has senses.
 * @param {object} entry - Parsed dictionary entry; `entry.senses` may be absent.
 * @returns {void}
 */
function writeDefinitions(thispath, entry) {
    const senses = entry.senses;
    // Check before touching the disk: an entry without senses needs no read.
    if (senses == null) {
        console.log("No senses key present on entry, skipping senses def");
        return;
    }

    const definitionsFile = thispath + "definitions.json";
    const definitions = JSON.parse(fs.readFileSync(definitionsFile, 'utf-8'));

    for (const sense of senses) {
        // Pushing a missing `glosses` would be serialized as `null` in the list.
        if (sense.glosses != null) {
            definitions.glosses.push(sense.glosses);
        }
    }

    fs.writeFileSync(definitionsFile, JSON.stringify(definitions));
}
/**
 * Create the directory for one (word, part of speech) pair and seed it with
 * empty definitions.json, sounds.json and thesaurus.json skeletons.
 * Existing files with those names are overwritten.
 *
 * @param {string} path - Target directory ending in "/"; missing parent
 *   directories are created as well.
 * @returns {void}
 */
function initializeDir(path) {
    fs.mkdirSync(path, { recursive: true });

    // File name -> empty skeleton, written in this order.
    // Key names (including the "antynoms" spelling) are the on-disk schema.
    const skeletons = [
        ["definitions.json", { glosses: [] }],
        ["sounds.json", { pronunciations: [], audios: [], rhymes: [], homophones: [] }],
        ["thesaurus.json", { synonyms: [], antynoms: [] }],
    ];

    for (const [fileName, skeleton] of skeletons) {
        fs.writeFileSync(path + fileName, JSON.stringify(skeleton));
    }
}

93
notes.jsonc Normal file
View File

@ -0,0 +1,93 @@
/*
THIS FILE IS NOT VALID JSON, ITS JUST NOTES ABOUT THE EXPECTED
STRUCTURE IN OBJECTS LOCATED IN A KAIKKI DICTIONARY
*/
//relevant object data to extract
{
"word": string,
"lang_code": string, //en, es, pt, etc...
"pos": string, //part of speech, noun, verb, etc
"senses": [ //various definitions of the word
{
"glosses": [ //actual definition
string,
]
}
],
"sounds": [
{
"tags": [
//received pronunciation -important
//US
],
"ipa": string
//OR
"enpr": string
},
{
"audio": string,
"ogg_url": URLSTRING
},
{
"rhymes": string
},
{
"homophone": string
},
],
}
//PERMUTE TO >
// {{some_path}}/dictionary/{{lang_code}}/{{word}}/{{pos}}/definitions.json
// {{some_path}}/dictionary/{{lang_code}}/{{word}}/{{pos}}/sounds.json
//definitions.json
{
"glosses": [ //combined list of glosses
]
}
//sounds.json
{
"pronunciations": [ //copies of pronunciation objects from sounds lists
],
"audios": [ //audio URLs (ogg_url / mp3_url) taken from audio objects in sounds lists
],
"rhymes": [ //collections of strings from rhyme key of rhyme object from sounds lists
],
"homophones": [ //same as rhymes but with homophones
]
}
//thesaurus.json
{
synonyms: [
],
antynoms: [
]
}

16
package.json Normal file
View File

@ -0,0 +1,16 @@
{
"name": "valdict",
"version": "1.0.0",
"description": "A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**.",
"repository": {
"type": "git",
"url": "https://gitea.apexfight.net/apex/valDict.git"
},
"license": "GFDL-1.3-or-later",
"author": "",
"type": "commonjs",
"main": "main.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
}
}