Source code for Calibre_Dataset
import logging
import os
from xml.etree import ElementTree as ET
[docs]
class Calibre_Dataset:
"""A class to access and search text documents from a Calibre library.
:param library_path: Path to the Calibre library
"""
def __init__(self, library_path, verbose=True):
self.log = logging.getLogger("CalibreLib")
library_path = os.path.expanduser(library_path)
if os.path.exists(os.path.join(library_path, "metadata.db")) is False:
raise FileNotFoundError("Calibre library not found at " + library_path)
self.library_path = library_path
self.verbose = verbose
self.records = []
[docs]
def load_index(self, use_aliases=False, max_file_size=None, truncate_large=True):
"""This function loads the Calibre library records that contain text-format books.
:param use_aliases: If True, books are not referenced by title and author,
:param max_file_size: If not None, files larger than max_file_size bytes are ignored or truncated (s.b.)
:param truncate_large: On True, files larger than max_file_size are truncated instead of ignored, only if max_file_size is not None
but by their numeric aliases, thus providing privacy.
"""
# Enumerate all txt-format books
self.records = []
index = 1
for root, dirs, files in os.walk(self.library_path):
for file in files:
if file.endswith(".txt"):
opf_file = os.path.join(root, "metadata.opf")
if os.path.exists(opf_file):
try:
tree = ET.parse(opf_file)
title = tree.find(
".//{http://purl.org/dc/elements/1.1/}title"
).text
author = tree.find(
".//{http://purl.org/dc/elements/1.1/}creator"
).text
language = tree.find(
".//{http://purl.org/dc/elements/1.1/}language"
).text
uuid_element = tree.find(
'.//dc:identifier[@opf:scheme="uuid"]',
namespaces={
"opf": "http://www.idpf.org/2007/opf",
"dc": "http://purl.org/dc/elements/1.1/",
},
)
if uuid_element is not None:
ebook_id = uuid_element.text
else:
self.log.error(
f"Error parsing {opf_file}: No UUID found"
)
continue
except Exception as e:
self.log.error(f"Error parsing {opf_file}: {e}")
continue
filename = os.path.join(root, file)
rec = {
"ebook_id": ebook_id,
"author": author,
"language": language,
"title": title,
"filename": filename,
}
if use_aliases is True:
rec["alias"] = f"CL{index}"
with open(filename, "r", encoding="utf-8") as f:
rec["text"] = f.read()
if max_file_size is not None:
if len(rec["text"]) > max_file_size:
if truncate_large is True:
rec["text"] = rec["text"][:max_file_size]
else:
continue
self.records += [rec]
index += 1
self.log.info(f"Loaded {len(self.records)} records from Calibre library.")
return len(self.records)
[docs]
def search(self, search_dict):
"""Search for book record with key specific key values
For a list of valid keys, use `get_record_keys()`
Standard keys are: `ebook_id`, `author`, `language`, `title`
*Note:* :func:`~Calibre_Dataset.Calibre_Dataset.load_index` needs to be called once before this function can be used.
Example: `search({"title": ["philosoph","phenomen","physic","hermeneu","logic"], "language":"english"})`
Find all books whose titles contain at least one of the keywords, language english. Search keys can either be
search for a single keyword (e.g. english), or an array of keywords.
:returns: list of records"""
if not hasattr(self, "records") or self.records is None:
self.log.debug("Index not loaded, trying to load...")
self.load_index()
frecs = []
for rec in self.records:
found = True
for sk in search_dict:
if sk not in rec:
found = False
break
else:
skl = search_dict[sk]
if not isinstance(skl, list):
skl = [skl]
nf = 0
for skli in skl:
if skli.lower() in rec[sk].lower():
nf = nf + 1
if nf == 0:
found = False
break
if found is True:
frecs += [rec]
return frecs