01st June, 2017

Generating inverted indexes in Python

Generating inverted indexes from text files in Python 3.6.

You can find this project on my GitHub by clicking here.


This program constructs an inverted index from the words contained within a series of text files; the index is generated in memory and then saved to disk immediately. The index is structured so that all files associated with a word are recorded in a single text file; in other words, each word has its own text file. Each text file contains a JSON object holding a list of entries, and each entry records which file contained the word and how many times the word occurred in it.

Word file structure

The word "hello" found in files: file_1.txt and file_2.txt

  
{"hello": [{"file_name": "file_1.txt", "occurrences": 6}, {"file_name": "file_2.txt", "occurrences": 2}]}
  

The advantage of splitting words into separate files is that it will be much faster to read and load each file into memory when running searches - compared to storing the entire index inside one text file.

For this repo I have uploaded 5 sample text files: hamlet.txt, othello.txt, macbeth.txt, julius-caesar.txt, and romeo-and-juliet.txt - each file contains the full text of the corresponding play (thanks to http://shakespeare.mit.edu). Using a 2016 MacBook Air with a 1.6GHz CPU and 4GB of RAM I was able to index all of these text files in around 20 seconds - this produced 23,636 word files.

  
{"thou": [{"file_name": "hamlet.txt", "occurrences": 85}, {"file_name": "julius-caesar.txt", "occurrences": 101}, {"file_name": "macbeth.txt", "occurrences": 65}, {"file_name": "othello.txt", "occurrences": 115}, {"file_name": "romeo-and-juliet.txt", "occurrences": 233}]}
  

Fun fact: Romeo and Juliet contains the word "thou" 233 times.

Source code

main.py

  
from file_manager import find_files, create_directory, get_file, check_file, check_directory
from config import create_config, get_config
from indexer import index_files


def main():
    """Prepare the configured directories and index the available text files.

    Creates ``config.txt`` with default values when it does not exist, reads
    the settings from it, makes sure the files and index directories exist,
    and then indexes every file in the files directory whose extension is
    listed in the config.

    Returns:
        None.
    """
    config_file = "config.txt"
    # first run: write a config file with default values
    if not check_file(config_file):
        create_config()
    # parse the config once rather than re-reading the file for every key
    config = get_config(config_file)
    files_directory = config['files_directory']
    index_directory = config['index_directory']
    extensions = config['extensions']
    delimiter = config['delimiter']
    encoding = config['encoding']
    # make sure both working directories exist before touching them
    for directory in (files_directory, index_directory):
        if not check_directory(directory):
            create_directory(directory)
    # NOTE: an existing index is not preloaded into memory. The previous
    # version called load_index() here, but no such function is defined or
    # imported, so any run with a non-empty index directory raised NameError.
    # index_files() creates and updates the per-word files on disk itself.
    files = find_files(files_directory, extensions)
    # only index when there is something to index
    if files:
        index_files(files_directory, index_directory, delimiter, files, encoding)
    return


# run the indexer only when this module is executed as a script
if __name__ == "__main__":
    main()
  

file_manager.py

  
from json import loads, dumps
from os import path, listdir, makedirs


def find_files(directory, extensions):
    """Return the names in *directory* whose extension is in *extensions*.

    The extension is the text after the LAST dot of the name, without the
    dot itself (``"txt"`` for ``"notes.backup.txt"``). Names that contain no
    dot have no extension and are skipped instead of raising ``IndexError``.

    Args:
        directory: path of the directory to list.
        extensions: collection of accepted extensions, e.g. ``["txt"]``.

    Returns:
        A list of matching entry names (not full paths), in the order
        ``listdir`` reports them.
    """
    # initialise files list
    files = []
    # for every entry found in directory
    for file in listdir(directory):
        # split on the last dot only; dot is "" when the name has no dot
        _, dot, extension = file.rpartition(".")
        # keep the entry when it has an accepted extension
        if dot and extension in extensions:
            files.append(file)
    # return list of files
    return files


def get_file(files_directory, file, encoding):
    """Read *file* from *files_directory* and return its whole text.

    Args:
        files_directory: directory that contains the file.
        file: name of the file inside that directory.
        encoding: text encoding used to decode the file.

    Returns:
        The decoded contents of the file as a single string.
    """
    # build the location of the file
    location = "{0}/{1}".format(files_directory, file)
    # read everything while the file is open
    with open(location, "r", encoding=encoding) as handle:
        contents = handle.read()
    # hand the text back once the file is closed
    return contents


def create_index_file(index_directory, word, file_data, encoding):
    """Write *file_data* to the index file of *word*, replacing any old one.

    Args:
        index_directory: directory that holds the per-word index files.
        word: the word; its index file is named ``<word>.txt``.
        file_data: text to store in the file.
        encoding: text encoding used when writing.

    Returns:
        None.
    """
    # location of the word's index file
    target = "{0}/{1}.txt".format(index_directory, word)
    # "w+" creates the file or empties an existing one before writing
    with open(target, "w+", encoding=encoding) as handle:
        handle.write(file_data)
    return


def append_index_file(index_directory, word, file_data, encoding):
    """Record one more occurrence of *word* in the file named *file_data*.

    Loads the JSON object stored in ``<index_directory>/<word>.txt``. If the
    list under *word* already has an entry for *file_data*, its
    ``occurrences`` count is incremented by 1; otherwise a new entry with
    ``occurrences`` 1 is appended. The updated object is then written back
    over the old contents.

    Args:
        index_directory: directory that holds the per-word index files.
        word: the word whose index file is updated; also the key of the
            list inside the stored JSON object.
        file_data: name of the source file in which the word occurred.
        encoding: text encoding used for reading and writing.

    Returns:
        None.
    """
    # open existing index file for reading and in-place rewriting
    with open("{0}/{1}.txt".format(index_directory, word), "r+", encoding=encoding) as data:
        # load json string into object
        word_object = loads(data.read())
        # for every file associated with word
        for file in word_object[word]:
            # if the file is already associated with the word
            if file['file_name'] == file_data:
                # increment its occurrences by 1
                file['occurrences'] += 1
                break
        else:
            # no entry matched: associate the new file with the word
            word_object[word].append({"file_name": file_data, "occurrences": 1})
        # go to beginning of file and overwrite existing data
        data.seek(0)
        data.write(dumps(word_object))
        # cut off whatever remains of a longer previous payload; without
        # this, leftover bytes after the new JSON make the file unparsable
        data.truncate()
    return


def create_directory(directory):
    """Create *directory*, including any missing parent directories.

    Returns:
        1 once the directory has been created.
    """
    success = 1
    # makedirs raises if the directory cannot be created
    makedirs(directory)
    return success


def check_directory(directory):
    """Tell whether *directory* is an existing directory.

    Returns:
        1 when the path is a directory, otherwise None.
    """
    # success flag for a directory, nothing for anything else
    return 1 if path.isdir(directory) else None


def check_file(file):
    """Tell whether the path *file* exists.

    Returns:
        1 when the path exists, otherwise None.
    """
    # success flag for an existing path, nothing for a missing one
    return 1 if path.exists(file) else None
  

indexer.py

  
from re import sub
from file_manager import get_file, check_file, create_index_file, append_index_file


def index_files(files_directory, index_directory, delimiter, files, encoding):
    """Add every word of the given files to the on-disk inverted index.

    Each file's text is split on *delimiter*. Every piece is stripped of
    non-word characters and lower-cased, and pieces of two characters or
    fewer are ignored. For each remaining word, the word's index file
    (``<index_directory>/<word>.txt``) is created with a first occurrence,
    or updated through ``append_index_file`` when it already exists.

    Args:
        files_directory: directory that contains the files to index.
        index_directory: directory that holds the per-word index files.
        delimiter: string the text is split on to obtain candidate words.
        files: names of the files (inside *files_directory*) to index.
        encoding: text encoding used for reading and writing.

    Returns:
        1 after all files have been processed.
    """
    # imported here because this module does not import json at the top
    from json import dumps

    # for every file in files
    for file in files:
        # open file and get text
        text = get_file(files_directory, file, encoding)
        # for every piece of text between delimiters
        for word in text.split(delimiter):
            # sanitise word
            # NOTE(review): characters matching \W (newlines included) are
            # removed only AFTER the split, so pieces separated by something
            # other than the delimiter are joined into one word - confirm
            # this is intended for the configured delimiter
            word = sub(r'\W+', '', word).lower()
            # skip words that are too short
            if len(word) <= 2:
                continue
            # if word file already exists, add this occurrence to it
            if check_file("{0}/{1}.txt".format(index_directory, word)):
                append_index_file(index_directory, word, file, encoding)
            else:
                # serialise with dumps so quotes or backslashes in the file
                # name are escaped and the stored JSON stays parsable;
                # ensure_ascii=False keeps non-ASCII text as-is in the file
                file_data = dumps(
                    {word: [{"file_name": file, "occurrences": 1}]},
                    ensure_ascii=False,
                )
                # create word file and write data
                create_index_file(index_directory, word, file_data, encoding)
    return 1
  

config.py

  
from json import loads


def create_config():
    """Write the default settings to ``config.txt`` in the working directory.

    An existing ``config.txt`` is overwritten.

    Returns:
        1 once the file has been written.
    """
    # default values, one JSON line per entry
    default_lines = (
        "{\n",
        "  \"files_directory\": \"files\",\n",
        "  \"index_directory\": \"index\",\n",
        "  \"extensions\": [\"txt\"],\n",
        "  \"delimiter\": \" \",\n",
        "  \"encoding\": \"UTF-8\"\n",
        "}\n",
    )
    # create config file and write the defaults in one go
    with open("config.txt", "w+") as config:
        config.write("".join(default_lines))
    # return success
    return 1


def get_config(file):
    """Parse the JSON document stored in *file* and return the result.

    Args:
        file: path of the config file to read.

    Returns:
        The decoded JSON value (a dict for the config files written by
        this project).
    """
    # read the raw bytes of the file
    with open(file, "rb") as handle:
        raw = handle.read()
    # decode the JSON payload and return it
    return loads(raw)
  

config.txt

  
{ "files_directory": "files", "index_directory": "index", "extensions": ["txt"], "delimiter": " ", "encoding": "UTF-8"}
  

Thank you for reading.

🌛🌛🌛


Leave a comment

Invalid or missing field(s).
Comment sent successfully, please wait for it to be approved.

This post has no comments