Source code for Calibre_Dataset

import logging
import os
from xml.etree import ElementTree as ET



[docs]
class Calibre_Dataset:
    """Access and search the plain-text (``.txt``) books of a Calibre library.

    After :meth:`load_index` has run, ``records`` holds one dict per book with
    the keys ``ebook_id``, ``author``, ``language``, ``title``, ``filename``,
    ``text`` and, when aliases were requested, ``alias``.

    :param library_path: Path to the Calibre library (``~`` is expanded); the
        folder must contain a ``metadata.db`` file
    :param verbose: Stored as ``self.verbose``
    :raises FileNotFoundError: If no ``metadata.db`` exists at ``library_path``
    """

    # XML namespaces looked up in the ``metadata.opf`` files.
    _DC_NS = "http://purl.org/dc/elements/1.1/"
    _OPF_NS = "http://www.idpf.org/2007/opf"

    def __init__(self, library_path, verbose=True):
        self.log = logging.getLogger("CalibreLib")
        library_path = os.path.expanduser(library_path)
        if not os.path.exists(os.path.join(library_path, "metadata.db")):
            raise FileNotFoundError("Calibre library not found at " + library_path)

        self.library_path = library_path
        self.verbose = verbose
        self.records = []
        # Lets search() tell "never loaded" apart from "loaded, but empty".
        self._index_loaded = False

    def _read_opf_metadata(self, opf_file):
        """Extract title, author, language and UUID from a ``metadata.opf`` file.

        :param opf_file: Path of the OPF file to parse
        :returns: dict with the keys ``ebook_id``, ``author``, ``language`` and
            ``title``, or ``None`` (after logging an error) if the file cannot
            be parsed or one of the elements is missing
        """
        try:
            tree = ET.parse(opf_file)
        except (ET.ParseError, OSError, ValueError) as e:
            # Best effort: one broken OPF file must not abort the whole scan.
            self.log.error("Error parsing %s: %s", opf_file, e)
            return None

        meta = {}
        for key, tag in (
            ("title", "title"),
            ("author", "creator"),
            ("language", "language"),
        ):
            element = tree.find(".//{%s}%s" % (self._DC_NS, tag))
            if element is None:
                self.log.error("Error parsing %s: No %s found", opf_file, tag)
                return None
            # May be None when the element exists but is empty.
            meta[key] = element.text

        uuid_element = tree.find(
            './/dc:identifier[@opf:scheme="uuid"]',
            namespaces={"opf": self._OPF_NS, "dc": self._DC_NS},
        )
        if uuid_element is None:
            self.log.error("Error parsing %s: No UUID found", opf_file)
            return None
        meta["ebook_id"] = uuid_element.text
        return meta

    def load_index(self, use_aliases=False, max_file_size=None, truncate_large=True):
        """Load the records of all Calibre books that have a ``.txt`` file.

        Every folder below the library path that contains both ``.txt`` files
        and a ``metadata.opf`` yields one record per ``.txt`` file. Folders whose
        OPF file cannot be parsed (or lacks title, creator, language or UUID)
        and text files that cannot be read as UTF-8 are logged and skipped.
        Folders and files are visited in sorted order, so alias numbering is
        reproducible for an unchanged library.

        :param use_aliases: If true, each record additionally gets an ``alias``
            entry (``CL1``, ``CL2``, ...) so that books can be referenced by a
            numeric alias instead of by title and author, thus providing privacy.
        :param max_file_size: If not None, texts longer than ``max_file_size``
            characters (of decoded text) are ignored or truncated (see
            ``truncate_large``)
        :param truncate_large: If true, texts longer than ``max_file_size`` are
            truncated to ``max_file_size`` characters instead of being ignored;
            only relevant if ``max_file_size`` is not None
        :returns: Number of records loaded
        """
        records = []
        # Reading one character more than the limit is enough to detect an
        # oversized text without pulling a huge file into memory.
        if max_file_size is not None and max_file_size >= 0:
            read_limit = int(max_file_size) + 1
        else:
            read_limit = -1  # read the whole file

        for root, dirs, files in os.walk(self.library_path):
            dirs.sort()  # in-place sort fixes the order os.walk descends in
            txt_files = sorted(name for name in files if name.endswith(".txt"))
            if not txt_files:
                continue
            opf_file = os.path.join(root, "metadata.opf")
            if not os.path.exists(opf_file):
                continue
            # Parse the OPF once per folder, not once per text file.
            meta = self._read_opf_metadata(opf_file)
            if meta is None:
                continue

            for name in txt_files:
                filename = os.path.join(root, name)
                try:
                    with open(filename, "r", encoding="utf-8") as f:
                        text = f.read(read_limit)
                except (OSError, UnicodeDecodeError) as e:
                    # Same best-effort policy as for broken OPF files.
                    self.log.error("Error reading %s: %s", filename, e)
                    continue

                if max_file_size is not None and len(text) > max_file_size:
                    if not truncate_large:
                        continue
                    text = text[:max_file_size]

                rec = {
                    "ebook_id": meta["ebook_id"],
                    "author": meta["author"],
                    "language": meta["language"],
                    "title": meta["title"],
                    "filename": filename,
                }
                if use_aliases:
                    # Aliases number only the records that are actually kept.
                    rec["alias"] = f"CL{len(records) + 1}"
                rec["text"] = text
                records.append(rec)

        self.records = records
        self._index_loaded = True
        self.log.info("Loaded %d records from Calibre library.", len(self.records))
        return len(self.records)

    @staticmethod
    def _field_matches(rec, key, keywords):
        """Return True if field ``key`` of ``rec`` contains at least one keyword.

        The comparison is a case-insensitive substring test. A missing or
        ``None`` field never matches.
        """
        value = rec.get(key)
        if value is None:
            return False
        if not isinstance(keywords, (list, tuple, set, frozenset)):
            keywords = [keywords]
        haystack = str(value).lower()
        return any(str(keyword).lower() in haystack for keyword in keywords)

    def search(self, search_dict):
        """Search for book records whose fields contain the given keywords.

        Valid keys are the keys of the loaded records: ``ebook_id``, ``author``,
        ``language``, ``title``, ``filename``, ``text`` and, if aliases were
        enabled, ``alias``.

        *Note:* if :func:`~Calibre_Dataset.Calibre_Dataset.load_index` has not
        been called yet, it is called once with default arguments.

        Example: `search({"title": ["philosoph","phenomen","physic","hermeneu","logic"], "language":"english"})`
        Find all books whose titles contain at least one of the keywords and
        whose language contains "english". The value of a search key is either
        a single keyword (e.g. "english") or a list (tuple, set) of keywords.
        Matching is a case-insensitive substring test; all search keys must
        match for a record to be returned.

        :param search_dict: dict mapping record keys to a keyword or a
            collection of keywords
        :returns: list of records"""
        never_loaded = not getattr(self, "_index_loaded", False)
        if never_loaded and not getattr(self, "records", None):
            self.log.debug("Index not loaded, trying to load...")
            self.load_index()
        return [
            rec
            for rec in self.records
            if all(
                self._field_matches(rec, key, keywords)
                for key, keywords in search_dict.items()
            )
        ]