Compare commits
22 Commits
Author | SHA1 | Date | |
---|---|---|---|
cd4838f692 | |||
25612ab51c | |||
9df1547fce | |||
6efed770d0 | |||
5f485ac73a | |||
6a58b8f48d | |||
9538f78198 | |||
4059d6d868 | |||
1a2358387f | |||
b4d5966550 | |||
008126c5f4 | |||
7021ab967c | |||
dcb9ae2f68 | |||
e5f1227948 | |||
3074606ea5 | |||
c5b4b15559 | |||
47015d8f40 | |||
801c17f4c2 | |||
c0a73acd03 | |||
a7e6424882 | |||
afabae41d6 | |||
396b666715 |
4
.gitignore
vendored
4
.gitignore
vendored
@ -1,3 +1,7 @@
|
|||||||
|
# Ignore large dictionary
|
||||||
|
kaikki.org-dictionary*
|
||||||
|
dictionary/
|
||||||
|
|
||||||
# ---> Node
|
# ---> Node
|
||||||
# Logs
|
# Logs
|
||||||
logs
|
logs
|
||||||
|
22
README.md
22
README.md
@ -1,4 +1,24 @@
|
|||||||
# valDict
|
# valDict
|
||||||
A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**.
|
A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**.
|
||||||
|
|
||||||
valDict can be served using the built in react server that comes with this repo, or your choice of web server.
|
valDict can be served using your choice of web server.
|
||||||
|
|
||||||
|
|
||||||
|
## Building
|
||||||
|
First, you need the "large_dir" option enabled on your filesystem; "dir_index" will also help speed things up. Both can be enabled with:
|
||||||
|
```bash
|
||||||
|
|
||||||
|
tune2fs -O large_dir /dev/DEVICE
|
||||||
|
tune2fs -O dir_index /dev/DEVICE
|
||||||
|
|
||||||
|
#DEVICE can be found with lsblk
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
IN ADDITION, you need a LOT of inodes — around 10 million. I highly recommend making a DEDICATED ext4 partition JUST FOR this dictionary with an outrageous inode count (remember to set large_dir and dir_index on the new fs).
|
||||||
|
|
||||||
|
Personally, I recommend a 40–45 GB partition: 45,000,000,000 bytes / 10,000,000 goal inodes = 4,500 bytes per inode (wow, that is crazy).
|
||||||
|
This is a decently efficient way to partition while still leaving a very decent amount of space for extra files or whatever future expansion may be required. Making a filesystem like this would probably mean first creating an ext4 partition with fdisk, then using mkfs.ext4 -i 4500 to achieve the 4,500 bytes-per-inode ratio calculated above.
|
||||||
|
|
||||||
|
Building valDict is very easy: just clone the repo and run `./downloadDict.sh`, then run `./cleanbuild.sh` (beware, building takes a LONG time — it has to write ~20 GB worth of files).
|
||||||
|
RUN AT OWN RISK BE CAREFUL CAREFUL CAREFUL
|
3
cleanbuild.sh
Executable file
3
cleanbuild.sh
Executable file
@ -0,0 +1,3 @@
|
|||||||
|
#!/bin/sh
# cleanbuild.sh — rebuild the dictionary tree from scratch.
# (Shebang added: the script is invoked as ./cleanbuild.sh and previously
# relied on the caller's default shell.)

# Remove any previous build output; verbose so progress is visible.
rm -rfv ./dictionary/

# Regenerate ./dictionary/ from the downloaded kaikki JSONL dump.
node gendictionary
|
21
downloadDict.sh
Executable file
21
downloadDict.sh
Executable file
@ -0,0 +1,21 @@
|
|||||||
|
#!/bin/sh
# downloadDict.sh — download and verify the kaikki.org English dictionary dump.

# -f: fail on HTTP errors instead of piping an HTML error page into gzip.
# -L: follow redirects.
curl -fL https://gitea.apexfight.net/apex/valDict/releases/download/6%2F24%2F25/kaikki.org-dictionary-English.jsonl.gz | gzip -dc > kaikki.org-dictionary-English.jsonl


echo calculating sha256 sum of download...

sha256sum kaikki.org-dictionary-English.jsonl > downloadsum

# correct sum
# NOTE: sha256sum separates hash and filename with TWO spaces; the previous
# unquoted echo emitted only one, so the diff below always reported a mismatch.
echo "5c8f1d50d4bf93113ffad1b2799d12757b3bf82d29ca5d2fbe1368f05befa91c  kaikki.org-dictionary-English.jsonl" > correctsum


echo Diff of downloaded sum to correct sum:
if ! diff ./correctsum ./downloadsum; then
    # Enforce the checksum instead of only printing it: a corrupt or
    # tampered download should not be silently accepted.
    echo "ERROR: checksum mismatch - download may be corrupt." >&2
    rm downloadsum correctsum
    exit 1
fi

sleep 1

echo Cleaning up...
rm downloadsum correctsum

echo Size of dictionary:
du -sh kaikki.org-dictionary-English.jsonl
|
111
gendictionary.js
Normal file
111
gendictionary.js
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
const fs = require('fs');
|
||||||
|
const rl = require('readline')
|
||||||
|
|
||||||
|
const trie = true;
|
||||||
|
const trieLevel = 4; //size between cuts for trie ex 4 : "/exam/ple"
|
||||||
|
|
||||||
|
const dictPath = "./dictionary/";
|
||||||
|
const language = "en"
|
||||||
|
|
||||||
|
let stream = fs.createReadStream("./kaikki.org-dictionary-English.jsonl", 'utf-8');
|
||||||
|
|
||||||
|
let reader = rl.createInterface(stream);
|
||||||
|
|
||||||
|
const path = dictPath + language + "/";
|
||||||
|
|
||||||
|
fs.mkdirSync(path,{recursive:true});
|
||||||
|
var iter = 0;
|
||||||
|
reader.on('line', (line) => {
|
||||||
|
iter++;
|
||||||
|
console.log("Iteration Number: "+iter.toString());
|
||||||
|
let entry = JSON.parse(line);
|
||||||
|
let thispath = path + getPath(entry.word) + entry.pos + "/";
|
||||||
|
console.log(thispath);
|
||||||
|
if (!fs.existsSync(thispath + "definitions.json")) {
|
||||||
|
initializeDir(thispath);
|
||||||
|
}
|
||||||
|
// console.log(entry);
|
||||||
|
|
||||||
|
let defintions = writeDefinitions(thispath, entry);
|
||||||
|
let sounds = writeSounds(thispath, entry);
|
||||||
|
let thesaurus = writeThesaurus(thispath, entry);
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
 * Map a word to its relative directory path fragment.
 * With the trie layout enabled, the word is cut into trieLevel-sized
 * segments, each followed by "/" (e.g. "example" -> "exam/ple/").
 * With the trie disabled, the whole word is a single segment.
 * @param {string} word - headword to convert
 * @returns {string} lowercased path fragment (each segment ends in "/")
 */
function getPath(word) {
  if (!trie) {
    return (word + '/').toLowerCase();
  }
  let fragment = "";
  for (let start = 0; start < word.length; start += trieLevel) {
    // slice() tolerates an end index past the string, so the final
    // (possibly short) chunk needs no special handling.
    fragment += word.slice(start, start + trieLevel) + "/";
  }
  return fragment.toLowerCase();
}
|
||||||
|
|
||||||
|
// Stub: thesaurus extraction is not implemented yet. It is called once per
// entry from the line handler but performs no work, so the thesaurus.json
// seeded by initializeDir() (synonyms/antonyms lists) stays empty.
// TODO: extract synonym/antonym data from the entry and merge it on disk.
function writeThesaurus(thispath, entry) {

}
|
||||||
|
/**
 * Merge an entry's pronunciation data into <thispath>/sounds.json.
 * Audio URLs, rhymes and homophones are routed to their own lists; any other
 * sound object (ipa/enpr/tags) is stored whole under "pronunciations".
 * @param {string} thispath - directory ending in "/" containing sounds.json
 * @param {object} entry - parsed kaikki.org entry
 */
function writeSounds(thispath, entry) {
  // Skip entries without sound data BEFORE touching the file — the previous
  // code read and parsed sounds.json even when it would immediately return.
  if (entry.sounds == null) {
    console.log("No sounds key present on entry, skipping sounds def...");
    return;
  }

  const soundsFile = thispath + "sounds.json";
  const sounds = JSON.parse(fs.readFileSync(soundsFile, 'utf-8'));

  for (const ele of entry.sounds) {
    if (ele.audio != null) {
      // Only push URLs that exist: pushing a missing ogg_url/mp3_url would
      // serialize as null entries in the audios array.
      if (ele.ogg_url != null) sounds.audios.push(ele.ogg_url);
      if (ele.mp3_url != null) sounds.audios.push(ele.mp3_url);
    } else if (ele.rhymes != null) {
      sounds.rhymes.push(ele.rhymes);
    } else if (ele.homophone != null) {
      sounds.homophones.push(ele.homophone);
    } else {
      sounds.pronunciations.push(ele);
    }
  }

  fs.writeFileSync(soundsFile, JSON.stringify(sounds));
}
|
||||||
|
/**
 * Append an entry's definition glosses to <thispath>/definitions.json.
 * Each sense contributes its glosses array, so the on-disk "glosses" key is
 * a list of per-sense gloss lists.
 * @param {string} thispath - directory ending in "/" containing definitions.json
 * @param {object} entry - parsed kaikki.org entry
 */
function writeDefinitions(thispath, entry) {
  const senses = entry.senses;
  // Skip entries without senses BEFORE touching the file — the previous
  // code read and parsed definitions.json even when it would immediately return.
  if (senses == null) {
    console.log("No senses key present on entry, skipping senses def");
    return;
  }

  const defFile = thispath + "definitions.json";
  const definitions = JSON.parse(fs.readFileSync(defFile, 'utf-8'));

  for (const sense of senses) {
    // Guard: a sense without a glosses key would otherwise push undefined,
    // which serializes as a null entry in the JSON array.
    if (sense.glosses != null) {
      definitions.glosses.push(sense.glosses);
    }
  }

  fs.writeFileSync(defFile, JSON.stringify(definitions));
}
|
||||||
|
/**
 * Create a word/pos directory and seed it with empty JSON documents:
 * definitions.json, sounds.json and thesaurus.json.
 * @param {string} path - directory ending in "/" (created recursively)
 */
function initializeDir(path) {
  fs.mkdirSync(path, { recursive: true });

  const seeds = {
    "definitions.json": { glosses: [] },
    "sounds.json": { pronunciations: [], audios: [], rhymes: [], homophones: [] },
    // NOTE(review): "antynoms" looks like a misspelling of "antonyms", but it
    // is the key persisted to disk, so it is preserved as-is here.
    "thesaurus.json": { synonyms: [], antynoms: [] },
  };

  for (const [file, doc] of Object.entries(seeds)) {
    fs.writeFileSync(path + file, JSON.stringify(doc));
  }
}
|
93
notes.jsonc
Normal file
93
notes.jsonc
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
/*
|
||||||
|
THIS FILE IS NOT VALID JSON, ITS JUST NOTES ABOUT THE EXPECTED
|
||||||
|
STRUCTURE IN OBJECTS LOCATED IN A KAIKKI DICTIONARY
|
||||||
|
*/
|
||||||
|
//relevant object data to extract
|
||||||
|
{
|
||||||
|
"word": string,
|
||||||
|
"lang_code": string, //en, es, pt, etc...
|
||||||
|
"pos": string, //part of speech, noun, verb, etc
|
||||||
|
"senses": [ //various definitions of the word
|
||||||
|
{
|
||||||
|
"glosses": [ //actual definition
|
||||||
|
string,
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"sounds": [
|
||||||
|
{
|
||||||
|
"tags": [
|
||||||
|
//Received Pronunciation -important
|
||||||
|
//US
|
||||||
|
],
|
||||||
|
|
||||||
|
"ipa": string
|
||||||
|
//OR
|
||||||
|
"enpr": string
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"audio": string,
|
||||||
|
"ogg_url": URLSTRING
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"rhymes": string
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"homophone": string
|
||||||
|
},
|
||||||
|
],
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//PERMUTE TO >
|
||||||
|
|
||||||
|
|
||||||
|
// {{some_path}}/dictionary/{{lang_code}}/{{word}}/{{pos}}/definitions.json
|
||||||
|
// {{some_path}}/dictionary/{{lang_code}}/{{word}}/{{pos}}/sounds.json
|
||||||
|
|
||||||
|
//definitions.json
|
||||||
|
|
||||||
|
{
|
||||||
|
"glosses": [ //combined list of glosses
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//sounds.json
|
||||||
|
|
||||||
|
{
|
||||||
|
"pronunciations": [ //copies of pronunciation objects from sounds lists
|
||||||
|
|
||||||
|
],
|
||||||
|
|
||||||
|
"audios": [ //copies of audio objects from sounds lists
|
||||||
|
|
||||||
|
],
|
||||||
|
"rhymes": [ //collections of strings from rhyme key of rhyme object from sounds lists
|
||||||
|
|
||||||
|
],
|
||||||
|
"homophones": [ //same as rhymes but with homphones
|
||||||
|
|
||||||
|
]
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//thesaurus.json
|
||||||
|
|
||||||
|
{
|
||||||
|
synonyms: [
|
||||||
|
|
||||||
|
],
|
||||||
|
antynoms: [
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
16
package.json
Normal file
16
package.json
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"name": "valdict",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "A very simple computer readable online dictionary based entirely off of data from **[Wiktionary](https://en.wiktionary.org/wiki/Wiktionary:Main_Page)** that uses data made with **[Wiktextract](https://github.com/tatuylonen/wiktextract)**.",
|
||||||
|
"repository": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://gitea.apexfight.net/apex/valDict.git"
|
||||||
|
},
|
||||||
|
"license": " GFDL-1.3-or-later",
|
||||||
|
"author": "",
|
||||||
|
"type": "commonjs",
|
||||||
|
"main": "main.js",
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user