From 55d58a16e2511741cc625e203205dec86144faf3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 18 Feb 2022 20:35:38 -0500 Subject: Reorganized repository layout --- src/bible.cc | 329 +++++++++++++++++++++++++++++++++++++++++++++++ src/lib/libbible.cc | 262 +++++++++++++++++++++++++++++++++++++ src/lib/libbible.h | 115 +++++++++++++++++ src/lib/mods.cc | 233 +++++++++++++++++++++++++++++++++ src/lib/settings.cc | 23 ++++ src/test/Makefile | 20 +++ src/test/modules/JPS.zip | Bin 0 -> 1170889 bytes src/test/modules/KJV.zip | Bin 0 -> 4061008 bytes src/test/testLibbible.cc | 265 ++++++++++++++++++++++++++++++++++++++ src/utf8.h | 34 +++++ src/utf8/checked.h | 327 ++++++++++++++++++++++++++++++++++++++++++++++ src/utf8/core.h | 329 +++++++++++++++++++++++++++++++++++++++++++++++ src/utf8/unchecked.h | 228 ++++++++++++++++++++++++++++++++ 13 files changed, 2165 insertions(+) create mode 100644 src/bible.cc create mode 100644 src/lib/libbible.cc create mode 100644 src/lib/libbible.h create mode 100644 src/lib/mods.cc create mode 100644 src/lib/settings.cc create mode 100644 src/test/Makefile create mode 100644 src/test/modules/JPS.zip create mode 100644 src/test/modules/KJV.zip create mode 100644 src/test/testLibbible.cc create mode 100644 src/utf8.h create mode 100644 src/utf8/checked.h create mode 100644 src/utf8/core.h create mode 100644 src/utf8/unchecked.h (limited to 'src') diff --git a/src/bible.cc b/src/bible.cc new file mode 100644 index 0000000..a09c0c0 --- /dev/null +++ b/src/bible.cc @@ -0,0 +1,329 @@ +#include "lib/libbible.h" +#include +#include +#include +#include +#include +#include +#include +#include "utf8.h" + +using namespace std; + +void usage() { + printf("\nUsage:\n bible [options] [reference]\n\n"); + printf("Print bible passages.\n\n"); + printf("Options:\n"); + printf(" -h, --help display this help message\n"); + printf(" --list-modules list all installed modules\n"); + printf(" -m, --module use specified module\n"); + printf(" 
--set-default-module use specified module by default in future runs\n"); + printf(" --list-books list books available in the current module\n"); + printf(" --list-chapters list chapters available in book in the current module\n"); + printf(" -o, --omit-verse-numbers when printing verse text, skip printing verse and chapter numbers\n"); + printf(" --list-installable= list bible versions available for download and install. Default lists for all languages.\n"); + printf(" --install-network install module from the network where is LANG:NAME as provided by --list-installable\n"); + printf(" --install-zip install module from a zip file\n"); + printf(" --remove-module delete a module from the system\n"); + printf("\n\nExamples:\n bible Gal 5:22-23\n"); + printf(" bible John 3:16\n bible Romans 12\n bible Matt 5:3-7:27\n"); + printf(" bible Genesis 1-3\n"); +} + +string getDefaultModule() { + return libbible::settingsRead("module"); +} + +void listModules() { + map> mods = libbible::getModules(); + string defaultMod = getDefaultModule(); + printf("Modules Installed:\n"); + for(auto pair : mods) { + if(pair.first == defaultMod) { + printf(" %s (default)\n", pair.first.c_str()); + } else { + printf(" %s\n", pair.first.c_str()); + } + } +} + +void setDefaultModule(string modname) { + libbible::settingsWrite("module", modname); +} + +void listBooks(string modname) { + map> mods = libbible::getModules(); + if(mods.find(modname) == mods.end()) { + printf("ERROR: Module \"%s\" not installed!\n", modname.c_str()); + } else { + printf("Books in Module %s:\n", modname.c_str()); + for(string book : mods[modname]) { + printf(" %s\n", book.c_str()); + } + } +} + +void listChapters(string modname, string book) { + printf("Valid chapters for book %s in module %s:\n", book.c_str(), modname.c_str()); + for(auto pass : libbible::getPassages(modname, book)) { + printf(" Chapter %d, Verses %d-%d\n", pass.chapterStart, pass.verseStart, pass.verseEnd); + } +} + +void listInstallable(string 
language) { + map> installable = libbible::downloadModsAvailable(); + map languages = libbible::getLanguageNames(); + for(auto pair : installable) { + if(!language.empty() && language != pair.first) { + continue; + } + printf("For language %s:", pair.first.c_str()); + if(!languages[pair.first].empty()) { + printf(" (%s)", languages[pair.first].c_str()); + } + printf("\n"); + for(string name : pair.second) { + printf(" %s\n", name.c_str()); + } + } +} + +void installNetwork(string mod) { + //Split on : + if(mod.find(':') == string::npos) { + printf("Unable to process module \"%s\": Must contain colon separated language:name\n", mod.c_str()); + return; + } + string lang = mod.substr(0, mod.find(':')); + string name = mod.substr(mod.find(':')+1); + if(libbible::installModFromInternet(lang, name)) { + printf("Module installed.\n"); + } else { + printf("Error installing module!\n"); + } +} + +void installZip(string path) { + libbible::installModFromZip(path); +} + +void removeMod(string mod) { + libbible::uninstallMod(mod); +} + +void textWrap(istream& in, ostream& out, size_t width) { + string word; + string line; + char cur = '\0'; + size_t i = 0; + + while(in.get(cur)) { + if(isspace(cur)) { + word.clear(); + } + if(cur == '\n') { + out << line << '\n'; + line.clear(); + word.clear(); + continue; + } + word += cur; + line += cur; + // Anything matching \033.*?m doesn't count + size_t credits = 0; + size_t found = -1; + while((found = line.find("\033", found+1)) != string::npos) { + size_t first = line.find_first_of("m", found); + if(first != string::npos) { + credits += first - found + 1; + } else { + credits += line.size() - found; + } + } + string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); + i = utf8::distance(line.begin(), end_it) - credits; + //printf("Word: %s, i: %ld\n", word.c_str(), i); + if(i > width) { + word.erase(0, word.find_first_not_of(" ")); + if(line.find_last_of(" ") != string::npos) { + line.erase(line.find_last_of(" ")); + 
out << line << '\n'; + } + line = word; + } + } + out << line; +} + +int main(int argc, char* argv[]) { + static struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"list-modules", no_argument, 0, 0}, + {"module", required_argument, 0, 'm'}, + {"set-default-module", required_argument, 0, 0}, + {"list-books", no_argument, 0, 0}, + {"list-chapters", required_argument, 0, 0}, + {"omit-verse-numbers", no_argument, 0, 'o'}, + {"list-installable", optional_argument, 0, 0}, + {"install-network", required_argument, 0, 0}, + {"install-zip", required_argument, 0, 0}, + {"remove-module", required_argument, 0, 0}, {0, 0, 0, 0} // getopt_long() requires the array to end with an all-zero element + }; + int opt, option_index; + string modname; + bool omitVerseNums = false; + bool doListBooks = false; + string listChaptersBook; + string option; + while ((opt = getopt_long(argc, argv, "hm:o", long_options, &option_index)) != -1) { + switch(opt) { + case 'h': + usage(); + return 0; + case 'm': + modname = string(optarg); + break; + case 'o': + omitVerseNums = true; + break; + case 0: + option = string(long_options[option_index].name); + if(option == "list-modules") { + listModules(); + return 0; + } else if(option == "set-default-module") { + setDefaultModule(string(optarg)); + } else if(option == "list-books") { + doListBooks = true; + } else if(option == "list-chapters") { + listChaptersBook = string(optarg); + } else if(option == "list-installable") { + if(optarg == nullptr) { + listInstallable(string()); + } else { + listInstallable(string(optarg)); + } + } else if(option == "install-network") { + installNetwork(string(optarg)); + } else if(option == "install-zip") { + installZip(string(optarg)); + } else if(option == "remove-module") { + removeMod(string(optarg)); + } + break; + default: + usage(); + return 1; + } + } + if(modname.empty()) { + modname = getDefaultModule(); + } + if(doListBooks) { + listBooks(modname); + } + if(!
listChaptersBook.empty()) { + listChapters(modname, listChaptersBook); + } + string reference; + while(optind < argc) { + reference += argv[optind++]; + reference += " "; + } + if(reference.empty()) { + // That's all. + return 0; + } + + auto text = libbible::getText(libbible::getPassage(modname, reference)); + int chapter = 0; + int verse = 0; + const char* indent = " "; + bool isNewline = true; + stringstream out; + for(auto tex : text) { + if(!omitVerseNums && tex.chapter != chapter) { + out << tex.book << " Chapter " << tex.chapter << ":\n"; + } + bool isParagraph = false; + bool isIndent = false; + bool isDivineName = false; + bool isJesus = false; + bool isTitle = false; + bool isParallel = false; + bool isPreverse = false; + for(string modifier : tex.modifiers) { + if(modifier == "paragraph") { + isParagraph = true; + } else if (modifier == "line indent0") { + isIndent = true; + } else if (modifier == "divineName") { + isDivineName = true; + } else if (modifier == "wordsOfJesus") { + isJesus = true; + } else if (modifier == "title") { + isTitle = true; + } else if (modifier == "parallel") { + isParallel = true; + } else if (modifier == "preverse") { + isPreverse = true; + } + } + if(isPreverse or isTitle or isParallel) { + // Someday maybe we add this, but for now, omit + tex.text = ""; + } + if(isIndent) { + isParagraph = false; + if(isNewline) { + out << indent; + } + } + if(isParagraph) { + out << indent; + } + if(isDivineName) { + transform(tex.text.begin(), tex.text.end(), tex.text.begin(), [](unsigned char c) { return static_cast<char>(::toupper(c)); }); // toupper() is undefined for negative char values, so go through unsigned char + } + if(isJesus) { + out << "\033[;31m"; + } + if(omitVerseNums && tex.verse != verse) { + out << " "; + } else if(!omitVerseNums && tex.verse != verse) { + out << " (" << tex.verse << ") "; + } + chapter = tex.chapter; + verse = tex.verse; + out << tex.text; + if(!tex.text.empty() && tex.text.back() == '\n') { // text is emptied above for preverse/title/parallel spans; back() on an empty string is undefined + isNewline = true; + } else { + isNewline = false; + } + if(isJesus) { + out << "\033[0m"; + } + } + out << "\n"; + + // Get window size + struct winsize size = {}; // zero-initialised so ws_col reads as 0 if the ioctl below fails and leaves it untouched +
ioctl(STDOUT_FILENO, TIOCGWINSZ, &size); + // size.ws_col is number of columns, or 0 if it's a pipe + int cols = size.ws_col; + // If terminal is too small, treat it like a pipe + if(cols < 10) { + cols = 0; + } + + // Now print + if(cols == 0) { + cout << out.str(); + } else { + stringstream out2; + textWrap(out, out2, cols); + cout << out2.str(); + } + return 0; +} diff --git a/src/lib/libbible.cc b/src/lib/libbible.cc new file mode 100644 index 0000000..c9acb7d --- /dev/null +++ b/src/lib/libbible.cc @@ -0,0 +1,262 @@ +#include "libbible.h" +#include +#include +#include +#include +#include +#include + +using namespace sword; +using namespace std; + +SWMgr library(new MarkupFilterMgr(FMT_XHTML)); +OSISFootnotes filter; + +vector getBooks(SWModule *target) { + vector books; + VerseKey *key = (VerseKey *) target->getKey(); + for(char t = 1; t <= key->getTestamentMax(); t++) { + key->setTestament(t); + for(char b = 1; b <= key->getBookMax(); b++) { + key->setBook(b); + // Bug (whose fault??) in JPS; they CLAIM to have two testaments, + // but they only have one, which causes repeats. + if(std::find(books.begin(), books.end(), key->getBookName()) != books.end()) { + continue; + } + // Another issue (maybe bug?) Some translations are NT only, + // but still report OT books/chapters. 
+ if(string(target->renderText()).empty()) { + continue; + } + books.push_back(key->getBookName()); + } + } + return books; +} + +map> libbible::getModules() { + library.load(); + map> mods; + ModMap::iterator it; + for (it = library.getModules().begin(); it != library.getModules().end(); it++) { + string modName = (*it).second->getName(); + SWModule *target = library.getModule(modName.c_str()); + mods[modName] = getBooks(target); + } + return mods; +} + +vector libbible::getPassages(string modName, string book) { + vector passages; + SWModule *target = library.getModule(modName.c_str()); + if(target == nullptr) { + // Module doesn't exist + return passages; + } + target->setKey((book + " " + "1").c_str()); + VerseKey *key = (VerseKey *) target->getKey(); + int maxChapter = key->getChapterMax(); + for(int chapter = 1; chapter <= maxChapter; chapter++) { + string ref = book + ' ' + to_string(chapter); + target->setKey(ref.c_str()); + VerseKey *key = (VerseKey *) target->getKey(); + libbible::passage pass; + pass.modName = modName; + pass.book = string(key->getBookName()); + pass.bookShort = string(key->getBookAbbrev()); + pass.chapterStart = chapter; + pass.chapterEnd = chapter; + pass.verseStart = 1; + pass.verseEnd = key->getVerseMax(); + passages.push_back(pass); + } + return passages; +} + +libbible::text getEmptyText(VerseKey *key) { + libbible::text t; + t.chapter = key->getChapter(); + t.verse = key->getVerse(); + t.book = key->getBookName(); + t.bookShort = key->getBookAbbrev(); + return t; +} + +libbible::passage libbible::getPassage(string modName, string reference) { + libbible::passage pass; + pass.modName = modName; + SWModule *target = library.getModule(pass.modName.c_str()); + if(target == nullptr || reference.empty()) { + // Bad input + return pass; + } + vector validBooks = getBooks(target); + //printf("Hey, I'm inferring missing parts!\n"); + // Let's use the target to help us + target->setKey(reference.c_str()); + VerseKey *key = (VerseKey *) 
target->getKey(); + pass.book = string(key->getBookName()); + // Hold on a moment, is this book even legal? + if(find(validBooks.begin(), validBooks.end(), pass.book) == validBooks.end()) { + key->setBookName(validBooks[0].c_str()); + pass.book = string(key->getBookName()); + } + pass.bookShort = string(key->getBookAbbrev()); + pass.chapterStart = key->getChapter(); + pass.verseStart = key->getVerse(); + //printf("Results so far: book: %s; chapterStart: %d; verseStart: %d\n", pass.book.c_str(), pass.chapterStart, pass.verseStart); + // And now we just need chapterEnd and verseEnd. Yippee. + string ref = string(reference); + ref.erase(remove(ref.begin(), ref.end(), ' '), ref.end()); + if(ref.find('-') == string::npos) { + // There's no range! + if(ref.find(':') == string::npos) { + // It's a full chapter reference + pass.chapterEnd = pass.chapterStart; + pass.verseEnd = key->getVerseMax(); + } else { + // It's a single verse reference + pass.chapterEnd = pass.chapterStart; + pass.verseEnd = pass.verseStart; + //printf("Hey, it's a single verse reference!\n"); + } + } else { + if(ref.find(':') == string::npos) { + // It's a multi-full-chapter reference + pass.chapterEnd = stoi(ref.substr(ref.find_last_of('-')+1)); + key->setChapter(pass.chapterEnd); + pass.verseEnd = key->getVerseMax(); + } else { + // It falls in categories c:v-v or c:v-c:v (or, technically, c-c:v) + string rangeEnd = ref.substr(ref.find_last_of('-')+1); + if(rangeEnd.find(':') == string::npos) { + // It's c:v-v + pass.verseEnd = stoi(rangeEnd); + pass.chapterEnd = pass.chapterStart; + } else { + // It's c:v-c:v (or c-c:v, but code is the same) + pass.chapterEnd = stoi(rangeEnd.substr(0, rangeEnd.find(':'))); + pass.verseEnd = stoi(rangeEnd.substr(rangeEnd.find(':')+1)); + } + } + } + return pass; +} + +vector libbible::getText(libbible::passage pass) { + vector texts; + SWModule *target = library.getModule(pass.modName.c_str()); + filter.setOptionValue("Off"); + target->addOptionFilter(&filter); + 
if(target == nullptr) { + // Module doesn't exist + return texts; + } + if(pass.book.empty()) { + pass.book = pass.bookShort; + } + target->setKey((pass.book + + " " + to_string(pass.chapterStart) + + ":" + to_string(pass.verseStart)).c_str()); + VerseKey *key = (VerseKey *) target->getKey(); + + bool endOfParagraph = false; + + string book = string(key->getBookName()); + + for(; string(key->getBookName()) == book && + (key->getChapter() < pass.chapterEnd + || (key->getChapter() == pass.chapterEnd && key->getVerse() <= pass.verseEnd)); + (*key)++) { + + string text = string(target->renderText()); + //printf("Working with: %s\n", text.c_str()); + + texts.push_back(getEmptyText(key)); + + if(key->getVerse() == 1 || endOfParagraph) { + if(find(texts.back().modifiers.begin(), texts.back().modifiers.end(), "paragraph") == texts.back().modifiers.end()) { + texts.back().modifiers.push_back("paragraph"); + } + endOfParagraph = false; + } + + // Variable to accumulate unterminated spans + std::vector> spans; + bool spansChanged = false; + bool hasAddedText = false; + // Iterate over text + for(auto i = text.begin(); i != text.end(); i++) { + if(*i != '<') { + if(spansChanged) { + spansChanged = false; + if(!texts.back().text.empty()) { + texts.push_back(getEmptyText(key)); + } + for(auto& [tag, modifier] : spans) { + if(find(texts.back().modifiers.begin(), texts.back().modifiers.end(), modifier) == texts.back().modifiers.end()) { + texts.back().modifiers.push_back(modifier); + } + } + } + if(*i == '\n') { + continue; // We add newlines with
+ } + if(! hasAddedText && (*i == ' ' || *i == '\t')) { + continue; + } + if(*i == "¶"[0] && i+1 != text.end() && *(i+1) == "¶"[1]) { + i++; + if(hasAddedText) { + texts.back().text += '\n'; + } else { + // Append \n to text in previous texts (if applicable) + if(texts.size() > 1) { + texts[texts.size()-2].text += '\n'; + } + texts.back().modifiers.push_back("paragraph"); + continue; + } + } + texts.back().text += *i; + hasAddedText = true; + } + else { + string span; + for(; i != text.end(); i++) { + span.push_back(*i); + if(*i == '>') { + // The end of the span will be "". + if(span[1] == '/') { + string tag = span.substr(2, span.size()-3); + for(auto rit = spans.rbegin(); rit != spans.rend(); rit++) { + if(rit->first == tag) { + spans.erase(rit.base()-1); + spansChanged = true; + break; + } + } + } else if(span.find("class=\"") != string::npos) { + // The span will be formatted "" + // We want just the NAME + string tag = span.substr(1, span.find(" ")-1); + size_t start = span.find("class=\"")+7; + size_t end = span.find("\"", start); + spans.push_back(std::pair(tag, span.substr(start, end-start))); + spansChanged = true; + } else if(span.find("preverse") != string::npos) { + string tag = span.substr(1, span.find(" ")-1); + spans.push_back(std::pair(tag, "preverse")); + } else if(span == "
" || span == "
") { + texts.back().text += '\n'; + } + break; + } + } + } + } + endOfParagraph = (!text.empty() && text.back() == '\n'); // the rendered verse may be empty; indexing length()-1 would then be out of range + } + return texts; +} diff --git a/src/lib/libbible.h b/src/lib/libbible.h new file mode 100644 index 0000000..f77dc8c --- /dev/null +++ b/src/lib/libbible.h @@ -0,0 +1,115 @@ +#include +#include +#include + +namespace libbible { + + struct text { + int chapter = 0; + int verse = 0; + std::string book; + std::string bookShort; + std::string text; + std::vector modifiers; // e.g., paragraph, line indent0, divineName, wordsOfJesus + }; + + struct passage { + std::string modName; + std::string book; + std::string bookShort; + int chapterStart = 0; // zero-initialised: getPassage() returns a passage without setting these on bad input + int verseStart = 0; + int chapterEnd = 0; + int verseEnd = 0; + }; + + /* + * @return Map of modName to supported books + */ + std::map> getModules(void); + + /* + * @return Vector of valid single full-chapter passages for a book + */ + std::vector getPassages(std::string modName, std::string book); + + /* + * @param modName the module to use for determining the passage + * @param reference a human-readable reference, e.g., "gen 1:26-27" + * @return the passage matching the reference + */ + passage getPassage(std::string modName, std::string reference); + + /* + * @return Text for a passage + */ + std::vector getText(struct passage pass); + + /************************** + * Methods dealing with mods + ***************************/ + + class Status { + public: + virtual void update(unsigned long totalBytes, unsigned long completedBytes, std::string message) {} + }; + + /** + * @param status Status update method is called asynchronously as download progresses + */ + void setStatusReporter(Status& status); + + /** + * @return A mapping from language to bible version names + */ + std::map> downloadModsAvailable(); + + /** + * @return A mapping from language abbreviations to full language names + */ + std::map getLanguageNames(); + + /** + * Cancel an in-progress download + */ + void terminateDownload(void); + + /** + * @param language
The language of the mod to install as provided from downloadModsAvailable + * @param name The name of the bible version as provided from downloadModsAvailable + * @see downloadModsAvailable() + * @return true on success, false otherwise + */ + bool installModFromInternet(std::string language, std::string name); + + /** + * @param filename Path to the .zip compressed module to be installed + * @return true on success, false otherwise + */ + bool installModFromZip(std::string filename); + + /** + * @param modname The name of the module to be removed + */ + void uninstallMod(std::string modname); + + /****************************** + * Methods dealing with settings + *******************************/ + + /* + * From already established code, valid and useful values are: + * int fontsize: the last used size of the font + * string passage: the last looked-up passage + * string module: the last used module + */ + + void settingsWrite(std::string key, std::string value); + + std::string settingsRead(std::string key); + + void settingsWriteInt(std::string key, int value); + + int settingsReadInt(std::string key); + +} diff --git a/src/lib/mods.cc b/src/lib/mods.cc new file mode 100644 index 0000000..ab54e48 --- /dev/null +++ b/src/lib/mods.cc @@ -0,0 +1,233 @@ +#include "libbible.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +class myStatusReporter : public sword::StatusReporter { + public: + myStatusReporter(libbible::Status *status); + ~myStatusReporter(); + void preStatus(long totalBytes, long completedBytes, const char *message); + void update(unsigned long totalBytes, unsigned long completedBytes); + protected: + libbible::Status *status; + string message; +}; + +myStatusReporter::myStatusReporter(libbible::Status *s) { + status = s; +} + +myStatusReporter::~myStatusReporter() {}; + +//virtual void libbible::Status::update(unsigned long totalBytes, unsigned long completedBytes, string message) {} + +void 
myStatusReporter::preStatus(long totalBytes, long completedBytes, const char *msg) { + message = string(msg); + status->update((unsigned long) totalBytes, (unsigned long) completedBytes, message); + //printf("Got a status update: %ld / %ld, \"%s\"\n", completedBytes, totalBytes, message.c_str()); +} + +void myStatusReporter::update(unsigned long totalBytes, unsigned long completedBytes) { + status->update(totalBytes, completedBytes, message); + //printf("Got a status update: %ld / %ld, \"%s\"\n", completedBytes, totalBytes, message.c_str()); +} + +string basedir = (getenv("HOME")) + string("/.sword/"); +sword::InstallMgr *installMgr = new sword::InstallMgr((basedir + std::string("InstallMgr")).c_str(), nullptr); +map>> installSources; +map languageNames; // maps abbreviation to full name + +void libbible::setStatusReporter(libbible::Status& status) { + myStatusReporter *msr = new myStatusReporter(&status); + free(installMgr); + installMgr = new sword::InstallMgr((basedir + std::string("InstallMgr")).c_str(), msr); + installMgr->setUserDisclaimerConfirmed(true); +} + +map> libbible::downloadModsAvailable() { + installSources.clear(); + languageNames.clear(); + mkdir((basedir + std::string("mods.d/")).c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); + mkdir((basedir + std::string("modules/")).c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); + installMgr->setUserDisclaimerConfirmed(true); + string confpath = basedir + string("InstallMgr/InstallMgr.conf"); + if(! 
sword::FileMgr::existsFile(confpath.c_str())) { + // Lifted directly from xiphos + sword::FileMgr::createParent(confpath.c_str()); + sword::SWConfig config(confpath.c_str()); + sword::InstallSource is("FTP"); + is.caption = "CrossWire"; + is.source = "ftp.crosswire.org"; + is.directory = "/pub/sword/raw"; + config["General"]["PassiveFTP"] = "true"; + config["Sources"]["FTPSource"] = is.getConfEnt(); + config.save(); + installMgr->refreshRemoteSourceConfiguration(); + } + installMgr->readInstallConf(); + map> modsAvailable; + map> languagesToFull; + //printf("Getting langs...\n"); + for(auto src : installMgr->sources) { + if(src.second->getMgr()->Modules.empty()) { + //printf("Refreshing remote source: %s\n", src.second->getConfEnt().c_str()); + installMgr->refreshRemoteSource(src.second); + } + for(auto mod : src.second->getMgr()->Modules) { + auto *curMod = mod.second; + string type(curMod->getType()); + if(type == "Biblical Texts") { + string language(curMod->getLanguage()); + string fullLang; + if(curMod->getConfigEntry("LCSH")) { + // Split on periods, last field, strip + fullLang = string(curMod->getConfigEntry("LCSH")); + // If ends with ., remove + if(fullLang.ends_with('.')) fullLang = fullLang.substr(0, fullLang.size()-1); + if(fullLang.find('.') != string::npos) fullLang = fullLang.substr(fullLang.find_last_of('.')+1); + while(fullLang.starts_with(' ')) fullLang = fullLang.substr(1); + while(fullLang.ends_with(' ')) fullLang = fullLang.substr(0, fullLang.size()-1); + } + vector newLangs; + languagesToFull.emplace(language, newLangs); + languagesToFull[language].push_back(fullLang); + vector newMods; + vector> newSources; + // emplace only adds if key is unique + modsAvailable.emplace(language, newMods); + installSources.emplace(language, newSources); + modsAvailable[language].push_back(string(curMod->getName())); + pair p(string(curMod->getName()), src.second); + installSources[language].push_back(p); + } + } + } + // Now use majority voting to move 
languagesToFull -> languageNames + for(const auto& [abbrev, fulls] : languagesToFull) { + std::map majVote; + for(auto full : fulls) { + majVote.try_emplace(full, 0); + majVote[full]++; + } + string selected = fulls[0]; + for(auto full : fulls) { + if(majVote[full] > majVote[selected] or (majVote[full] == majVote[selected] and !full.empty() and full.size() < selected.size())) { + selected = full; + } + } + if(selected.empty()) languageNames[abbrev] = abbrev; + else languageNames[abbrev] = selected; + } + return modsAvailable; +} + +std::map libbible::getLanguageNames() { + if(languageNames.empty()) { + downloadModsAvailable(); + } + return languageNames; +} + +void libbible::terminateDownload() { + installMgr->terminate(); +} + +bool libbible::installModFromInternet(string language, string name) { + // Searching through map>> installSources; + if(installSources.empty()) { + downloadModsAvailable(); + } + for (pair p : installSources[language]) { + if(p.first == name) { + sword::SWMgr mgr(basedir.c_str()); + if(installMgr->installModule(&mgr, 0, name.c_str(), p.second) == 0) { + printf("Installed from %s\n", p.second->getConfEnt().c_str()); + return true; + } + return false; + } + } + return false; +} + +#define READ_SIZE 8192 +#define delim '/' + +bool libbible::installModFromZip(string filename) { + // So... turns out it's a mite unsupported to install from a .zip + // Here's the deal. We do a syscall to unzip. We fancy like that. + // TODO: Use the ZipCompress module from SWORD instead. + /*string command = "unzip -o " + filename + " -d " + basedir + "&> /dev/null"; + if(system(command.c_str())) { + //Uh oh... + printf("Something bad happened when unpacking %s\n. 
Is unzip installed?", filename.c_str()); + }*/ + unzFile zipfile = unzOpen(filename.c_str()); + if(zipfile == NULL) { + return false; + } + unz_global_info global_info; + if(unzGetGlobalInfo(zipfile, &global_info) != UNZ_OK) { + unzClose(zipfile); + return false; + } + char read_buffer[READ_SIZE]; + ulong i; + for(i = 0; i < global_info.number_entry; i++) { + unz_file_info file_info; + if(unzGetCurrentFileInfo(zipfile, &file_info, read_buffer, READ_SIZE, NULL, 0, NULL, 0) != UNZ_OK) { + unzClose(zipfile); + return false; + } + string fname = basedir + string(read_buffer); + size_t pos = fname.find_last_of(delim); + if(pos != string::npos) { + string path = fname.substr(0, pos); + filesystem::create_directories(path); + } + if(unzOpenCurrentFile(zipfile) != UNZ_OK) { + unzCloseCurrentFile(zipfile); + unzClose(zipfile); + return false; + } + FILE *out = fopen(fname.c_str(), "wb"); + if(out == NULL) { + unzCloseCurrentFile(zipfile); + unzClose(zipfile); + return false; + } + int bytesRead; + do { + bytesRead = unzReadCurrentFile(zipfile, read_buffer, READ_SIZE); + if(bytesRead < 0) { + printf("error %d\n", bytesRead); + unzCloseCurrentFile(zipfile); + unzClose(zipfile); + return false; + } + if(bytesRead > 0) { + fwrite(read_buffer, bytesRead, 1, out); + } + } while(bytesRead > 0); + fclose(out); + unzCloseCurrentFile(zipfile); + unzGoToNextFile(zipfile); + } + unzClose(zipfile); + return true; +} + +void libbible::uninstallMod(string modname) { + sword::SWMgr mgr(basedir.c_str()); + sword::ModMap::iterator it = mgr.Modules.find(modname.c_str()); + if(it != mgr.Modules.end()) { + installMgr->removeModule(&mgr, it->second->getName()); + } +} diff --git a/src/lib/settings.cc b/src/lib/settings.cc new file mode 100644 index 0000000..848e22f --- /dev/null +++ b/src/lib/settings.cc @@ -0,0 +1,23 @@ +#include "libbible.h" +#include + +std::string path = (std::getenv("HOME")) + std::string("/.sword/libbible.conf"); +sword::SWConfig config(path.c_str()); + +void 
libbible::settingsWrite(std::string key, std::string value) { + config["General"][key.c_str()] = sword::SWBuf(value.c_str()); + config.save(); +} + +std::string libbible::settingsRead(std::string key) { + return config["General"][key.c_str()].c_str(); +} + +void libbible::settingsWriteInt(std::string key, int value) { + config["General"][key.c_str()] = sword::SWBuf(std::to_string(value).c_str()); + config.save(); +} + +int libbible::settingsReadInt(std::string key) { + return atoi(config["General"][key.c_str()].c_str()); +} diff --git a/src/test/Makefile b/src/test/Makefile new file mode 100644 index 0000000..1f8bc8b --- /dev/null +++ b/src/test/Makefile @@ -0,0 +1,20 @@ +LIBS = sword minizip +override CXXFLAGS += -MMD -Wall -fPIC -std=c++20 `pkg-config $(LIBS) --cflags` +override LDFLAGS += -lstdc++fs `pkg-config $(LIBS) --libs` -lcppunit ../../libbible.so +SOURCES = $(wildcard *.cc) +OBJECTS = $(SOURCES:.cc=.o) +DEPS = $(OBJECTS:.o=.d) +TEST = testLibbible + +$(TEST): $(OBJECTS) + $(CXX) $(OBJECTS) -o $@ $(LDFLAGS) + +-include $(DEPS) + +.PHONY: test +test: $(TEST) + ./$(TEST) + +.PHONY: clean +clean: + $(RM) $(OBJECTS) $(DEPS) $(TEST) diff --git a/src/test/modules/JPS.zip b/src/test/modules/JPS.zip new file mode 100644 index 0000000..4f09ff8 Binary files /dev/null and b/src/test/modules/JPS.zip differ diff --git a/src/test/modules/KJV.zip b/src/test/modules/KJV.zip new file mode 100644 index 0000000..27c161d Binary files /dev/null and b/src/test/modules/KJV.zip differ diff --git a/src/test/testLibbible.cc b/src/test/testLibbible.cc new file mode 100644 index 0000000..d3a265a --- /dev/null +++ b/src/test/testLibbible.cc @@ -0,0 +1,265 @@ +//#include +#include "libbible.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace CppUnit; +using namespace std; + +//----------------------------------------------------------------------------- + 
+class TestLibbible : public CppUnit::TestFixture +{ + CPPUNIT_TEST_SUITE(TestLibbible); + CPPUNIT_TEST(testGetModules); + CPPUNIT_TEST(testGetPassages); + CPPUNIT_TEST(testGetText); + CPPUNIT_TEST(testSettings); + CPPUNIT_TEST(testDownload); + CPPUNIT_TEST_SUITE_END(); + + //public: + //void setUp(void); + //void tearDown(void); + + protected: + void testGetModules(void); + void testGetPassages(void); + void testGetText(void); + void testSettings(void); + void testDownload(void); + +}; + +//----------------------------------------------------------------------------- + +class StatusTester : public libbible::Status +{ + public: + virtual void update(unsigned long totalBytes, unsigned long completedBytes, string message); + bool hasBeenUpdated = false; +}; + +void StatusTester::update(unsigned long totalBytes, unsigned long completedBytes, string message) { + hasBeenUpdated = true; +} + +//----------------------------------------------------------------------------- + +class CancelTester : public libbible::Status +{ + public: + virtual void update(unsigned long totalBytes, unsigned long completedBytes, string message); +}; + +void CancelTester::update(unsigned long totalBytes, unsigned long completedBytes, string message) { + libbible::terminateDownload(); +} + +//----------------------------------------------------------------------------- + +void TestLibbible::testGetModules(void) { + map> mods = libbible::getModules(); + for(auto pair : mods) { + libbible::uninstallMod(pair.first); + } + CPPUNIT_ASSERT(libbible::getModules().empty()); + CPPUNIT_ASSERT(libbible::installModFromZip("modules/KJV.zip")); + CPPUNIT_ASSERT(libbible::installModFromZip("modules/JPS.zip")); + mods = libbible::getModules(); + CPPUNIT_ASSERT(mods.find("KJV") != mods.end()); + CPPUNIT_ASSERT(mods["KJV"].size() == 66); + CPPUNIT_ASSERT(mods["KJV"][7] == "Ruth"); + CPPUNIT_ASSERT(mods["KJV"][42] == "John"); + CPPUNIT_ASSERT(mods.find("JPS") != mods.end()); + CPPUNIT_ASSERT(mods["JPS"].size() == 
39); +} + +void TestLibbible::testGetPassages(void) { + auto passages = libbible::getPassages("KJV", "Romans"); + CPPUNIT_ASSERT(passages[0].modName == "KJV"); + CPPUNIT_ASSERT(passages[0].book == "Romans"); + CPPUNIT_ASSERT(passages[0].bookShort == "Rom"); + CPPUNIT_ASSERT(passages[0].chapterStart == 1); + CPPUNIT_ASSERT(passages[0].verseStart == 1); + CPPUNIT_ASSERT(passages[0].chapterEnd == 1); + CPPUNIT_ASSERT(passages[0].verseEnd == 32); + CPPUNIT_ASSERT(passages.size() == 16); +} + +vector> getChapVerses(std::vector text) { + vector> chapVerses; + for(auto tex : text) { + //printf("Text is: `%s`\n", tex.text.c_str()); + //for(auto modifier : tex.modifiers) { + // printf("\tModifiers include: %s\n", modifier.c_str()); + //} + if(chapVerses.empty() || + chapVerses.back().first != tex.chapter || + chapVerses.back().second != tex.verse) { + chapVerses.push_back(pair(tex.chapter, tex.verse)); + } + } + return chapVerses; +} + +void TestLibbible::testGetText(void) { + libbible::passage pass; + pass.modName = "KJV"; + pass.bookShort = "Matt"; + pass.chapterStart = 3; + pass.verseStart = 16; + pass.chapterEnd = 4; + pass.verseEnd = 7; + auto text = libbible::getText(pass); + // Verify that it includes every verse (3:16-17 + 4:1-7) + vector> chapVerses = getChapVerses(text); + vector> shouldContain = vector>({pair(3, 16), + pair(3, 17), + pair(4, 1), + pair(4, 2), + pair(4, 3), + pair(4, 4), + pair(4, 5), + pair(4, 6), + pair(4, 7)}); + CPPUNIT_ASSERT(chapVerses == shouldContain); + libbible::passage pass2; + pass2.modName = "KJV"; + pass2.book = "John"; + pass2.chapterStart = 3; + pass2.verseStart = 16; + pass2.chapterEnd = 3; + pass2.verseEnd = 16; + text = libbible::getText(pass2); + string allText; + for(auto tex : text) { + allText += tex.text; + } + //printf("Text is: `%s`\n", allText.c_str()); + CPPUNIT_ASSERT(allText == "For God so loved the world, that he gave his only begotten Son, that whosoever believeth in him should not perish, but have everlasting life. 
"); + + text = libbible::getText(libbible::getPassage("KJV", "John 3:3")); + allText.clear(); + for(auto tex : text) { + allText += tex.text; + } + //printf("Text is: `%s`\n", allText.c_str()); + CPPUNIT_ASSERT(allText == "Jesus answered and said unto him, Verily, verily, I say unto thee, Except a man be born again, he cannot see the kingdom of God. "); + + text = libbible::getText(libbible::getPassage("KJV", "Gal 5:22-23")); + chapVerses = getChapVerses(text); + shouldContain = vector>({pair(5, 22), pair(5, 23)}); + CPPUNIT_ASSERT(chapVerses == shouldContain); + + text = libbible::getText(libbible::getPassage("KJV", "1 cor 1:31-2:1")); + chapVerses = getChapVerses(text); + shouldContain = vector>({pair(1, 31), pair(2, 1)}); + CPPUNIT_ASSERT(chapVerses == shouldContain); + + text = libbible::getText(libbible::getPassage("KJV", "ps 14-15")); + chapVerses = getChapVerses(text); + shouldContain = vector>({pair(14, 1), + pair(14, 2), + pair(14, 3), + pair(14, 4), + pair(14, 5), + pair(14, 6), + pair(14, 7), + pair(15, 1), + pair(15, 2), + pair(15, 3), + pair(15, 4), + pair(15, 5)}); + CPPUNIT_ASSERT(chapVerses == shouldContain); + + text = libbible::getText(libbible::getPassage("KJV", "John 21")); + CPPUNIT_ASSERT(text.back().verse == 25); +} + +void TestLibbible::testSettings(void) { + libbible::settingsWrite("test", "foo"); + CPPUNIT_ASSERT(libbible::settingsRead("test") == "foo"); + libbible::settingsWrite("test", "bar"); + CPPUNIT_ASSERT(libbible::settingsRead("test") == "bar"); + libbible::settingsWriteInt("test", 5); + CPPUNIT_ASSERT(libbible::settingsReadInt("test") == 5); + libbible::settingsWrite("test", ""); + CPPUNIT_ASSERT(libbible::settingsRead("test") == ""); +} + +void TestLibbible::testDownload(void) { + map> modsAvailable = libbible::downloadModsAvailable(); + // We try installing the first available one + string language; + string name; + for(auto pair : modsAvailable) { + language = pair.first; + name = pair.second[0]; + break; + } + 
CPPUNIT_ASSERT(!language.empty() && !name.empty()); + // Try uninstalling it (shouldn't crash or have nasty side effects!) + libbible::uninstallMod(name); + // Try installing it with cancel. Shoudn't work because it gets cancelled! + CancelTester cancel; + libbible::setStatusReporter(cancel); + libbible::installModFromInternet(language, name); + auto mods = libbible::getModules(); + CPPUNIT_ASSERT(mods.find(name) == mods.end()); + // Now we try with normal status + StatusTester status; + libbible::setStatusReporter(status); + libbible::installModFromInternet(language, name); + mods = libbible::getModules(); + CPPUNIT_ASSERT(mods.find(name) != mods.end()); + CPPUNIT_ASSERT(status.hasBeenUpdated); + +} +//----------------------------------------------------------------------------- + +CPPUNIT_TEST_SUITE_REGISTRATION( TestLibbible ); + +int main(int argc, char* argv[]) { + // informs test-listener about testresults + CPPUNIT_NS::TestResult testresult; + + // register listener for collecting the test-results + CPPUNIT_NS::TestResultCollector collectedresults; + testresult.addListener (&collectedresults); + + // register listener for per-test progress output + CPPUNIT_NS::BriefTestProgressListener progress; + testresult.addListener (&progress); + + // insert test-suite at test-runner by registry + CPPUNIT_NS::TestRunner testrunner; + testrunner.addTest (CPPUNIT_NS::TestFactoryRegistry::getRegistry().makeTest ()); + testrunner.run(testresult); + + // output results in compiler-format + CPPUNIT_NS::CompilerOutputter compileroutputter(&collectedresults, std::cerr); + compileroutputter.write (); + + // Output XML for Jenkins CPPunit plugin + //ofstream xmlFileOut("testLibbibleResults.xml"); + //XmlOutputter xmlOut(&collectedresults, xmlFileOut); + //xmlOut.write(); + + // return 0 if tests were successful + return collectedresults.wasSuccessful() ? 
0 : 1; +} diff --git a/src/utf8.h b/src/utf8.h new file mode 100644 index 0000000..4e44514 --- /dev/null +++ b/src/utf8.h @@ -0,0 +1,34 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/src/utf8/checked.h b/src/utf8/checked.h new file mode 100644 index 0000000..1331155 --- /dev/null +++ b/src/utf8/checked.h @@ -0,0 +1,327 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" +#include + +namespace utf8 +{ + // Base for the exceptions that may be thrown from the library + class exception : public ::std::exception { + }; + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public exception { + uint32_t cp; + public: + invalid_code_point(uint32_t cp) : cp(cp) {} + virtual const char* what() const throw() { return "Invalid code point"; } + uint32_t code_point() const {return cp;} + }; + + class invalid_utf8 : public exception { + uint8_t u8; + public: + invalid_utf8 (uint8_t u) : u8(u) {} + virtual const char* what() const throw() { return "Invalid UTF-8"; } + uint8_t utf8_octet() const {return u8;} + }; + + class invalid_utf16 : public exception { + uint16_t u16; + public: + invalid_utf16 (uint16_t u) : u16(u) {} + virtual const char* what() const throw() { return "Invalid UTF-16"; } + uint16_t utf16_word() const {return u16;} + }; + + class not_enough_room : public exception { + public: + virtual const char* what() const throw() { return "Not enough space"; } + }; + + /// The library API - functions intended to be called by the users + + template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); + *(result++) = 
static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + throw not_enough_room(); + case internal::INVALID_LEAD: + out = utf8::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::replace_invalid(start, end, out, replacement_marker); + } + + template + uint32_t next(octet_iterator& it, octet_iterator end) + { + uint32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); + switch (err_code) { + case internal::UTF8_OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + throw invalid_utf8(*it); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template + uint32_t peek_next(octet_iterator it, octet_iterator end) + { + return utf8::next(it, end); + } + + template + uint32_t prior(octet_iterator& it, octet_iterator start) + { + // can't do much if it == 
start + if (it == start) + throw not_enough_room(); + + octet_iterator end = it; + // Go back until we hit either a lead octet or start + while (utf8::internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return utf8::peek_next(it, end); + } + + /// Deprecated in versions that include "prior" + template + uint32_t previous(octet_iterator& it, octet_iterator pass_start) + { + octet_iterator end = it; + while (utf8::internal::is_trail(*(--it))) + if (it == pass_start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + octet_iterator temp = it; + return utf8::next(temp, end); + } + + template + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + for (distance_type i = 0; i < n; ++i) + utf8::next(it, end); + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::next(first, last); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start != end) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + if (utf8::internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast(trail_surrogate)); + } + else + throw invalid_utf16(static_cast(cp)); + + } + // Lone trail surrogate + else if (utf8::internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast(cp)); + + result = utf8::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start != end) { 
+ uint32_t cp = utf8::next(start, end); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start != end) + (*result++) = utf8::next(start, end); + + return result; + } + + // The iterator class + template + class iterator : public std::iterator { + octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; + public: + iterator () {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& range_start, + const octet_iterator& range_end) : + it(octet_it), range_start(range_start), range_end(range_end) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return utf8::next(temp, range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + utf8::next(it, range_end); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + utf8::next(it, range_end); + return temp; + } + iterator& operator -- () + { + utf8::prior(it, range_start); + return *this; + } + iterator operator -- (int) + { + iterator temp = 
*this; + utf8::prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace utf8 + +#endif //header guard + + diff --git a/src/utf8/core.h b/src/utf8/core.h new file mode 100644 index 0000000..693d388 --- /dev/null +++ b/src/utf8/core.h @@ -0,0 +1,329 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include + +namespace utf8 +{ + // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers + // You may need to change them to match your system. 
+ // These typedefs have the same names as ones from cstdint, or boost/cstdint + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + +// Helper code - not intended to be directly called by the library users. May be changed at any time +namespace internal +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const uint16_t LEAD_SURROGATE_MIN = 0xd800u; + const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; + const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); + const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; + + // Maximum valid value for a Unicode code point + const uint32_t CODE_POINT_MAX = 0x0010ffffu; + + template + inline uint8_t mask8(octet_type oc) + { + return static_cast(0xff & oc); + } + template + inline uint16_t mask16(u16_type oc) + { + return static_cast(0xffff & oc); + } + template + inline bool is_trail(octet_type oc) + { + return ((utf8::internal::mask8(oc) >> 6) == 0x2); + } + + template + inline bool is_lead_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); + } + + template + inline bool is_trail_surrogate(u16 cp) + { + return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template + inline bool is_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template + inline bool is_code_point_valid(u32 cp) + { + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + } + + template + inline typename std::iterator_traits::difference_type + sequence_length(octet_iterator lead_it) + { + uint8_t lead = utf8::internal::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + 
return 4; + else + return 0; + } + + template + inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) + { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else if (cp < 0x10000) { + if (length != 3) + return true; + } + + return false; + } + + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + /// Helper for get_sequence_x + template + utf_error increase_safely(octet_iterator& it, octet_iterator end) + { + if (++it == end) + return NOT_ENOUGH_ROOM; + + if (!utf8::internal::is_trail(*it)) + return INCOMPLETE_SEQUENCE; + + return UTF8_OK; + } + + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + + /// get_sequence_x functions decode utf-8 sequences of the length x + template + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + return UTF8_OK; + } + + template + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + + return UTF8_OK; + } + + template + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + template + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, 
uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR + + template + utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + octet_iterator original_it = it; + + uint32_t cp = 0; + // Determine the sequence length based on the lead octet + typedef typename std::iterator_traits::difference_type octet_difference_type; + const octet_difference_type length = utf8::internal::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = UTF8_OK; + switch (length) { + case 0: + return INVALID_LEAD; + case 1: + err = utf8::internal::get_sequence_1(it, end, cp); + break; + case 2: + err = utf8::internal::get_sequence_2(it, end, cp); + break; + case 3: + err = utf8::internal::get_sequence_3(it, end, cp); + break; + case 4: + err = utf8::internal::get_sequence_4(it, end, cp); + break; + } + + if (err == UTF8_OK) { + // Decoding succeeded. Now, security checks... + if (utf8::internal::is_code_point_valid(cp)) { + if (!utf8::internal::is_overlong_sequence(cp, length)){ + // Passed! Return here. 
+ code_point = cp; + ++it; + return UTF8_OK; + } + else + err = OVERLONG_SEQUENCE; + } + else + err = INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + + template + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { + uint32_t ignored; + return utf8::internal::validate_next(it, end, ignored); + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const uint8_t bom[] = {0xef, 0xbb, 0xbf}; + + template + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); + if (err_code != internal::UTF8_OK) + return result; + } + return result; + } + + template + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (utf8::find_invalid(start, end) == end); + } + + template + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + ); + } + + //Deprecated in release 2.3 + template + inline bool is_bom (octet_iterator it) + { + return ( + (utf8::internal::mask8(*it++)) == bom[0] && + (utf8::internal::mask8(*it++)) == bom[1] && + (utf8::internal::mask8(*it)) == bom[2] + ); + } +} // namespace utf8 + +#endif // header guard + + diff --git a/src/utf8/unchecked.h b/src/utf8/unchecked.h new file mode 100644 index 0000000..cb24271 --- /dev/null +++ b/src/utf8/unchecked.h @@ -0,0 +1,228 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, 
reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" + +namespace utf8 +{ + namespace unchecked + { + template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + template + uint32_t next(octet_iterator& it) + { + uint32_t cp = utf8::internal::mask8(*it); + typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); + switch (length) { + case 1: + break; + case 2: + it++; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + ++it; + cp += (*it) & 0x3f; + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp += (utf8::internal::mask8(*it) << 6) & 0xfff; + ++it; + cp += (*it) & 0x3f; + break; + } + ++it; + return cp; + } + + template + uint32_t peek_next(octet_iterator it) + { + return utf8::unchecked::next(it); + } + + template + uint32_t prior(octet_iterator& it) + { + while (utf8::internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + + // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) + 
template + inline uint32_t previous(octet_iterator& it) + { + return utf8::unchecked::prior(it); + } + + template + void advance (octet_iterator& it, distance_type n) + { + for (distance_type i = 0; i < n; ++i) + utf8::unchecked::next(it); + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::unchecked::next(first); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = utf8::unchecked::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + uint32_t cp = utf8::unchecked::next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::unchecked::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::unchecked::next(start); + + return result; + } + + // The iterator class + template + class iterator : public std::iterator { + octet_iterator it; + public: + iterator () {} + explicit iterator (const 
octet_iterator& octet_it): it(octet_it) {} + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + bool operator == (const iterator& rhs) const + { + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + ::std::advance(it, utf8::internal::sequence_length(it)); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + ::std::advance(it, utf8::internal::sequence_length(it)); + return temp; + } + iterator& operator -- () + { + utf8::unchecked::prior(it); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::unchecked::prior(it); + return temp; + } + }; // class iterator + + } // namespace utf8::unchecked +} // namespace utf8 + + +#endif // header guard + -- cgit v1.2.3