Source code for Folder_Dataset

import logging
import os
import uuid



[docs]
class Folder_Dataset:
    """A class to access and search text documents from a folder."""

    def __init__(self):
        self.log = logging.getLogger("FolderTextLib")
        self.records = []
        self.index = 1


[docs]
    def load_index(
        self,
        folder_path,
        file_extensions=[".txt", ".md", ".py", ".org"],
        max_file_size=None,
        truncate_large=True,
        default_language="English",
        default_author=None,
        use_aliases=False,
    ):
        """This function loads the text files from the folder.

        Load index is additive, new texts are added to already imported texts.

        This loads the text files from a folder and creates some metadata from the filename.
        If `parse_metadata` is `True`, the filename format is expected to be: "Title - Author - Language.txt"

        :param folder_path: Path to a folder containing text files with valid extensions
        :param file_extensions: List of file extensions that identify valid text files
        :param max_file_size: If not None, files larger than max_file_size bytes are ignored or truncated (s.b.)
        :param truncate_large: On True, files larger than max_file_size are truncated instead of ignored, only if max_file_size is not None
        :param default_language: If language is not given via parse_metadata, use None or "English" etc.
        :param default_author: If author is not given via parse_metadata, use None or a specific author that applies to all texts.
        :param use_aliases: If True, documents are not referenced by filename (containing title and author),
        but by their numeric aliases, thus providing privacy.
        """
        folder_path = os.path.expanduser(folder_path)
        cur_index = 0
        if os.path.exists(folder_path) is False:
            raise FileNotFoundError("Folder not found at " + folder_path)
        for root, dirs, files in os.walk(folder_path):
            # Skip hidden stuff:
            files = [f for f in files if not f[0] == "."]
            dirs[:] = [d for d in dirs if not d[0] == "."]
            for file in files:
                # print(file)
                parts = os.path.splitext(file)
                stem = parts[0]
                ext = parts[1]
                if ext in file_extensions:
                    components = stem.split(" - ")
                    if len(components) == 3:
                        title = components[0]
                        author = components[1]
                        language = components[2]
                    elif len(components) == 2:
                        title = components[0]
                        author = components[1]
                        language = default_language
                    else:
                        title = stem
                        author = default_author
                        language = default_language
                    filename = os.path.join(root, file)
                    if ext == ".py":
                        language = "Python"
                    # get a unique ID for the book using a crc from the filename
                    ebook_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, filename))
                    rec = {
                        "ebook_id": ebook_id,
                        "author": author,
                        "language": language,
                        "title": title,
                        "filename": filename,
                    }
                    if use_aliases is True:
                        rec["alias"] = f"FL{self.index}"

                    with open(filename, "r", encoding="utf-8") as f:
                        rec["text"] = f.read()
                    if max_file_size is not None:
                        if len(rec["text"]) > max_file_size:
                            if truncate_large is True:
                                rec["text"] = rec["text"][:max_file_size]
                            else:
                                continue
                    self.records += [rec]
                    self.index = self.index + 1
                    cur_index = cur_index + 1
        self.log.info(
            f"Loaded {cur_index} records from folder, grand total is now {len(self.records)} records."
        )
        return



[docs]
    def get_records(self):
        """Return the imported text records"""
        return self.records



[docs]
    def search(self, search_dict):
        """Search for book record with key specific key values
        For a list of valid keys, use `get_record_keys()`
        Standard keys are: `ebook_id`, `author`, `language`, `title`

        *Note:* :func:`~Folder_Dataset.Folder_Dataset.load_index` needs to be called once before this function can be used.

        Example: `search({"title": ["philosoph","phenomen","physic","hermeneu","logic"], "language":"english"})`
        Find all books whose titles contain at least one of the keywords, language english. Search keys can either be
        search for a single keyword (e.g. english), or an array of keywords.

        :returns: list of records"""
        frecs = []
        for rec in self.records:
            found = True
            for sk in search_dict:
                if sk not in rec:
                    found = False
                    break
                else:
                    skl = search_dict[sk]
                    if not isinstance(skl, list):
                        skl = [skl]
                    nf = 0
                    for skli in skl:
                        if skli.lower() in rec[sk].lower():
                            nf = nf + 1
                    if nf == 0:
                        found = False
                        break
            if found is True:
                frecs += [rec]
        return frecs