Compare commits
22 Commits
Author | SHA1 | Date | |
---|---|---|---|
cd4838f692 | |||
25612ab51c | |||
9df1547fce | |||
6efed770d0 | |||
5f485ac73a | |||
6a58b8f48d | |||
9538f78198 | |||
4059d6d868 | |||
1a2358387f | |||
b4d5966550 | |||
008126c5f4 | |||
7021ab967c | |||
dcb9ae2f68 | |||
e5f1227948 | |||
3074606ea5 | |||
c5b4b15559 | |||
47015d8f40 | |||
801c17f4c2 | |||
c0a73acd03 | |||
a7e6424882 | |||
afabae41d6 | |||
396b666715 |
4
.gitignore
vendored
4
.gitignore
vendored
@ -1,3 +1,7 @@
|
|||||||
|
# Ignore large dictionary
|
||||||
|
kaikki.org-dictionary*
|
||||||
|
dictionary/
|
||||||
|
|
||||||
# ---> Node
|
# ---> Node
|
||||||
# Logs
|
# Logs
|
||||||
logs
|
logs
|
||||||
|
22
README.md
22
README.md
@ -1,4 +1,24 @@
|
|||||||
# valDict
|
# valDict
|
||||||
A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**.
|
A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**.
|
||||||
|
|
||||||
valDict can be served using the built in react server that comes with this repo, or your choice of web server.
|
valDict can be served using your choice of web server.
|
||||||
|
|
||||||
|
|
||||||
|
## Building
|
||||||
|
First, you need the "large_dir" option enabled on your filesystem; "dir_index" will also help speed things up. Both can be enabled with:
|
||||||
|
```bash
|
||||||
|
|
||||||
|
tune2fs -O large_dir /dev/DEVICE
|
||||||
|
tune2fs -O dir_index /dev/DEVICE
|
||||||
|
|
||||||
|
#DEVICE can be found with lsblk
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
IN ADDITION, you need a LOT of inodes — around 10 million. I highly recommend making a DEDICATED ext4 partition JUST FOR this dictionary with an outrageous inode count (remember to set large_dir and dir_index on the new fs).
|
||||||
|
|
||||||
|
Personally, I recommend a 40–45 GB partition: 45,000,000,000 bytes / 10,000,000 goal inodes = 4,500 bytes per inode (wow, that is crazy).
|
||||||
|
This is a decently efficient way to partition while still leaving a very decent amount of space for extra files or whatever future expansion may be required. Making a filesystem like this would probably mean first creating an ext4 partition with fdisk, then using mkfs.ext4 -i 4500 to achieve the 4,500 bytes-per-inode ratio calculated above.
|
||||||
|
|
||||||
|
Building valDict is very easy: just clone the repo and run `./downloadDict.sh`, then run `./cleanbuild.sh` (beware, building takes a LONG time — it has to write ~20 GB worth of files).
|
||||||
|
RUN AT OWN RISK BE CAREFUL CAREFUL CAREFUL
|
3
cleanbuild.sh
Executable file
3
cleanbuild.sh
Executable file
@ -0,0 +1,3 @@
|
|||||||
|
#!/bin/sh
# cleanbuild.sh — rebuild the dictionary tree from scratch.
# (Shebang added: the script is invoked as ./cleanbuild.sh and previously
# relied on the caller's default shell.)

# Remove any previous build output; verbose so progress is visible.
rm -rfv ./dictionary/

# Regenerate ./dictionary/ from the downloaded kaikki JSONL dump.
node gendictionary
|
21
downloadDict.sh
Executable file
21
downloadDict.sh
Executable file
@ -0,0 +1,21 @@
|
|||||||
|
#!/bin/sh
# downloadDict.sh — download and verify the kaikki.org English dictionary dump.

# -f: fail on HTTP errors instead of piping an HTML error page into gzip.
# -L: follow redirects.
curl -fL https://gitea.apexfight.net/apex/valDict/releases/download/6%2F24%2F25/kaikki.org-dictionary-English.jsonl.gz | gzip -dc > kaikki.org-dictionary-English.jsonl


echo calculating sha256 sum of download...

sha256sum kaikki.org-dictionary-English.jsonl > downloadsum

# correct sum
# NOTE: sha256sum separates hash and filename with TWO spaces; the previous
# unquoted echo emitted only one, so the diff below always reported a mismatch.
echo "5c8f1d50d4bf93113ffad1b2799d12757b3bf82d29ca5d2fbe1368f05befa91c  kaikki.org-dictionary-English.jsonl" > correctsum


echo Diff of downloaded sum to correct sum:
if ! diff ./correctsum ./downloadsum; then
    # Enforce the checksum instead of only printing it: a corrupt or
    # tampered download should not be silently accepted.
    echo "ERROR: checksum mismatch - download may be corrupt." >&2
    rm downloadsum correctsum
    exit 1
fi

sleep 1

echo Cleaning up...
rm downloadsum correctsum

echo Size of dictionary:
du -sh kaikki.org-dictionary-English.jsonl
|
111
gendictionary.js
Normal file
111
gendictionary.js
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
const fs = require('fs');
|
||||||
|
const rl = require('readline')
|
||||||
|
|
||||||
|
const trie = true;
|
||||||
|
const trieLevel = 4; //size between cuts for trie ex 4 : "/exam/ple"
|
||||||
|
|
||||||
|
const dictPath = "./dictionary/";
|
||||||
|
const language = "en"
|
||||||
|
|
||||||
|
let stream = fs.createReadStream("./kaikki.org-dictionary-English.jsonl", 'utf-8');
|
||||||
|
|
||||||
|
let reader = rl.createInterface(stream);
|
||||||
|
|
||||||
|
const path = dictPath + language + "/";
|
||||||
|
|
||||||
|
fs.mkdirSync(path,{recursive:true});
|
||||||
|
var iter = 0;
|
||||||
|
reader.on('line', (line) => {
|
||||||
|
iter++;
|
||||||
|
console.log("Iteration Number: "+iter.toString());
|
||||||
|
let entry = JSON.parse(line);
|
||||||
|
let thispath = path + getPath(entry.word) + entry.pos + "/";
|
||||||
|
console.log(thispath);
|
||||||
|
if (!fs.existsSync(thispath + "definitions.json")) {
|
||||||
|
initializeDir(thispath);
|
||||||
|
}
|
||||||
|
// console.log(entry);
|
||||||
|
|
||||||
|
let defintions = writeDefinitions(thispath, entry);
|
||||||
|
let sounds = writeSounds(thispath, entry);
|
||||||
|
let thesaurus = writeThesaurus(thispath, entry);
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
 * Map a word to its relative directory path fragment.
 * With the trie layout enabled, the word is cut into trieLevel-sized
 * segments, each followed by "/" (e.g. "example" -> "exam/ple/").
 * With the trie disabled, the whole word is a single segment.
 * @param {string} word - headword to convert
 * @returns {string} lowercased path fragment (each segment ends in "/")
 */
function getPath(word) {
  if (!trie) {
    return (word + '/').toLowerCase();
  }
  let fragment = "";
  for (let start = 0; start < word.length; start += trieLevel) {
    // slice() tolerates an end index past the string, so the final
    // (possibly short) chunk needs no special handling.
    fragment += word.slice(start, start + trieLevel) + "/";
  }
  return fragment.toLowerCase();
}
|
||||||
|
|
||||||
|
// Stub: thesaurus extraction is not implemented yet. It is called once per
// entry from the line handler but performs no work, so the thesaurus.json
// seeded by initializeDir() (synonyms/antonyms lists) stays empty.
// TODO: extract synonym/antonym data from the entry and merge it on disk.
function writeThesaurus(thispath, entry) {

}
|
||||||
|
/**
 * Merge an entry's pronunciation data into <thispath>/sounds.json.
 * Audio URLs, rhymes and homophones are routed to their own lists; any other
 * sound object (ipa/enpr/tags) is stored whole under "pronunciations".
 * @param {string} thispath - directory ending in "/" containing sounds.json
 * @param {object} entry - parsed kaikki.org entry
 */
function writeSounds(thispath, entry) {
  // Skip entries without sound data BEFORE touching the file — the previous
  // code read and parsed sounds.json even when it would immediately return.
  if (entry.sounds == null) {
    console.log("No sounds key present on entry, skipping sounds def...");
    return;
  }

  const soundsFile = thispath + "sounds.json";
  const sounds = JSON.parse(fs.readFileSync(soundsFile, 'utf-8'));

  for (const ele of entry.sounds) {
    if (ele.audio != null) {
      // Only push URLs that exist: pushing a missing ogg_url/mp3_url would
      // serialize as null entries in the audios array.
      if (ele.ogg_url != null) sounds.audios.push(ele.ogg_url);
      if (ele.mp3_url != null) sounds.audios.push(ele.mp3_url);
    } else if (ele.rhymes != null) {
      sounds.rhymes.push(ele.rhymes);
    } else if (ele.homophone != null) {
      sounds.homophones.push(ele.homophone);
    } else {
      sounds.pronunciations.push(ele);
    }
  }

  fs.writeFileSync(soundsFile, JSON.stringify(sounds));
}
|
||||||
|
/**
 * Append an entry's definition glosses to <thispath>/definitions.json.
 * Each sense contributes its glosses array, so the on-disk "glosses" key is
 * a list of per-sense gloss lists.
 * @param {string} thispath - directory ending in "/" containing definitions.json
 * @param {object} entry - parsed kaikki.org entry
 */
function writeDefinitions(thispath, entry) {
  const senses = entry.senses;
  // Skip entries without senses BEFORE touching the file — the previous
  // code read and parsed definitions.json even when it would immediately return.
  if (senses == null) {
    console.log("No senses key present on entry, skipping senses def");
    return;
  }

  const defFile = thispath + "definitions.json";
  const definitions = JSON.parse(fs.readFileSync(defFile, 'utf-8'));

  for (const sense of senses) {
    // Guard: a sense without a glosses key would otherwise push undefined,
    // which serializes as a null entry in the JSON array.
    if (sense.glosses != null) {
      definitions.glosses.push(sense.glosses);
    }
  }

  fs.writeFileSync(defFile, JSON.stringify(definitions));
}
|
||||||
|
/**
 * Create a word/pos directory and seed it with empty JSON documents:
 * definitions.json, sounds.json and thesaurus.json.
 * @param {string} path - directory ending in "/" (created recursively)
 */
function initializeDir(path) {
  fs.mkdirSync(path, { recursive: true });

  const seeds = {
    "definitions.json": { glosses: [] },
    "sounds.json": { pronunciations: [], audios: [], rhymes: [], homophones: [] },
    // NOTE(review): "antynoms" looks like a misspelling of "antonyms", but it
    // is the key persisted to disk, so it is preserved as-is here.
    "thesaurus.json": { synonyms: [], antynoms: [] },
  };

  for (const [file, doc] of Object.entries(seeds)) {
    fs.writeFileSync(path + file, JSON.stringify(doc));
  }
}
|
93
notes.jsonc
Normal file
93
notes.jsonc
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
/*
|
||||||
|
THIS FILE IS NOT VALID JSON, ITS JUST NOTES ABOUT THE EXPECTED
|
||||||
|
STRUCTURE IN OBJECTS LOCATED IN A KAIKKI DICTIONARY
|
||||||
|
*/
|
||||||
|
//relevant object data to extract
|
||||||
|
{
|
||||||
|
"word": string,
|
||||||
|
"lang_code": string, //en, es, pt, etc...
|
||||||
|
"pos": string, //part of speech, noun, verb, etc
|
||||||
|
"senses": [ //various definitions of the word
|
||||||
|
{
|
||||||
|
"glosses": [ //actual definition
|
||||||
|
string,
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"sounds": [
|
||||||
|
{
|
||||||
|
"tags": [
|
||||||
|
//Received Pronunciation -important
|
||||||
|
//US
|
||||||
|
],
|
||||||
|
|
||||||
|
"ipa": string
|
||||||
|
//OR
|
||||||
|
"enpr": string
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"audio": string,
|
||||||
|
"ogg_url": URLSTRING
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"rhymes": string
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"homophone": string
|
||||||
|
},
|
||||||
|
],
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//PERMUTE TO >
|
||||||
|
|
||||||
|
|
||||||
|
// {{some_path}}/dictionary/{{lang_code}}/{{word}}/{{pos}}/definitions.json
|
||||||
|
// {{some_path}}/dictionary/{{lang_code}}/{{word}}/{{pos}}/sounds.json
|
||||||
|
|
||||||
|
//definitions.json
|
||||||
|
|
||||||
|
{
|
||||||
|
"glosses": [ //combined list of glosses
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//sounds.json
|
||||||
|
|
||||||
|
{
|
||||||
|
"pronunciations": [ //copies of pronunciation objects from sounds lists
|
||||||
|
|
||||||
|
],
|
||||||
|
|
||||||
|
"audios": [ //copies of audio objects from sounds lists
|
||||||
|
|
||||||
|
],
|
||||||
|
"rhymes": [ //collections of strings from rhyme key of rhyme object from sounds lists
|
||||||
|
|
||||||
|
],
|
||||||
|
"homophones": [ //same as rhymes but with homphones
|
||||||
|
|
||||||
|
]
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//thesaurus.json
|
||||||
|
|
||||||
|
{
|
||||||
|
synonyms: [
|
||||||
|
|
||||||
|
],
|
||||||
|
antynoms: [
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
16
package.json
Normal file
16
package.json
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"name": "valdict",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**.",
|
||||||
|
"repository": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://gitea.apexfight.net/apex/valDict.git"
|
||||||
|
},
|
||||||
|
"license": " GFDL-1.3-or-later",
|
||||||
|
"author": "",
|
||||||
|
"type": "commonjs",
|
||||||
|
"main": "main.js",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user