Source code for Folder_Dataset

import logging
import os
import uuid


class Folder_Dataset:
    """A class to access and search text documents from a folder.

    Each imported document is stored as a plain dict with the keys
    ``ebook_id``, ``author``, ``language``, ``title``, ``filename`` and
    ``text`` (plus ``alias`` when aliases are requested at load time).
    """

    def __init__(self):
        self.log = logging.getLogger("FolderTextLib")
        # All records imported so far; load_index() appends to this list.
        self.records = []
        # Running 1-based counter used to build the "FL<n>" aliases.
        self.index = 1

    @staticmethod
    def _metadata_from_stem(stem, default_author, default_language):
        """Split a filename stem into ``(title, author, language)``."""
        components = stem.split(" - ")
        if len(components) == 3:
            return components[0], components[1], components[2]
        if len(components) == 2:
            return components[0], components[1], default_language
        # Any other shape: the whole stem is the title.
        return stem, default_author, default_language

    def load_index(
        self,
        folder_path,
        file_extensions=(".txt", ".md", ".py", ".org"),
        max_file_size=None,
        truncate_large=True,
        default_language="English",
        default_author=None,
        use_aliases=False,
    ):
        """Load the text files from a folder (recursively) into the dataset.

        Loading is additive: new texts are appended to the texts that were
        already imported. Hidden files and hidden directories (names starting
        with ``.``) are skipped. Files are read as UTF-8.

        Metadata is derived from the filename. Supported formats are
        ``"Title - Author - Language.ext"`` and ``"Title - Author.ext"``;
        any other filename is used as the title as-is, with the default
        author and language. Files with the extension ``.py`` always get the
        language ``"Python"``.

        :param folder_path: Path to a folder containing text files with valid
            extensions (``~`` is expanded)
        :param file_extensions: Collection of file extensions (including the
            leading dot, case-sensitive) that identify valid text files
        :param max_file_size: If not None, texts longer than max_file_size
            are ignored or truncated (see truncate_large). The size is
            measured in characters of the decoded text.
        :param truncate_large: If true, texts longer than max_file_size are
            truncated instead of ignored; only used if max_file_size is not
            None
        :param default_language: Language used when the filename does not
            provide one, e.g. None or "English"
        :param default_author: Author used when the filename does not provide
            one, e.g. None or a specific author that applies to all texts
        :param use_aliases: If true, each record additionally gets an
            ``alias`` of the form ``FL<n>``, so documents can be referenced
            without exposing the filename (which contains title and author)
        :raises FileNotFoundError: if folder_path does not exist
        """
        folder_path = os.path.expanduser(folder_path)
        if not os.path.exists(folder_path):
            raise FileNotFoundError("Folder not found at " + folder_path)

        cur_index = 0
        for root, dirs, files in os.walk(folder_path):
            # Prune hidden directories in place so os.walk does not descend
            # into them.
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            for file in files:
                if file.startswith("."):
                    continue
                stem, ext = os.path.splitext(file)
                if ext not in file_extensions:
                    continue

                title, author, language = self._metadata_from_stem(
                    stem, default_author, default_language
                )
                if ext == ".py":
                    language = "Python"

                filename = os.path.join(root, file)
                with open(filename, "r", encoding="utf-8") as f:
                    text = f.read()
                if max_file_size is not None and len(text) > max_file_size:
                    if not truncate_large:
                        continue
                    text = text[:max_file_size]

                rec = {
                    # Deterministic ID: a UUIDv5 derived from the full path.
                    "ebook_id": str(uuid.uuid5(uuid.NAMESPACE_DNS, filename)),
                    "author": author,
                    "language": language,
                    "title": title,
                    "filename": filename,
                }
                if use_aliases:
                    rec["alias"] = f"FL{self.index}"
                rec["text"] = text

                self.records.append(rec)
                self.index += 1
                cur_index += 1

        self.log.info(
            f"Loaded {cur_index} records from folder, grand total is now {len(self.records)} records."
        )
        return

    def get_records(self):
        """Return the list of imported text records."""
        return self.records

    def search(self, search_dict):
        """Search for records whose fields contain the given keywords.

        Standard keys are: `ebook_id`, `author`, `language`, `title`.

        *Note:* :func:`~Folder_Dataset.Folder_Dataset.load_index` needs to be
        called once before this function can be used.

        Example: `search({"title": ["philosoph","phenomen","physic","hermeneu","logic"], "language":"english"})`
        finds all books whose titles contain at least one of the keywords and
        whose language contains "english".

        A search value is either a single keyword (e.g. "english") or a
        list/tuple/set of keywords, of which at least one must match. Matching
        is a case-insensitive substring test. All keys of `search_dict` must
        match for a record to be returned; a record that lacks a key, or whose
        value for that key is not a string (e.g. an author of None), does not
        match.

        :param search_dict: dict mapping record keys to keyword(s)
        :returns: list of records
        """
        matches = []
        for rec in self.records:
            for key, keywords in search_dict.items():
                value = rec.get(key)
                # Fields such as author/language may be None; those can never
                # contain a keyword, so treat them as a non-match instead of
                # failing on .lower().
                if not isinstance(value, str):
                    break
                if not isinstance(keywords, (list, tuple, set)):
                    keywords = [keywords]
                haystack = value.lower()
                if not any(kw.lower() in haystack for kw in keywords):
                    break
            else:
                # No criterion failed (also true for an empty search_dict).
                matches.append(rec)
        return matches