01st June, 2017

Generating inverted indexes in Python

Generating inverted indexes from text files in Python 3.6.

You can find this project on my GitHub by clicking here.

This program constructs an inverted index from the words contained within a series of text files; the index is generated in memory and then saved to disk immediately. The index is constructed so that all files associated with a word are recorded in a single text file; in other words, each word has its own text file. Each word file contains a JSON object whose value is a list of entries, and each entry records which file contained the word and how many times the word occurred in it.

Word file structure

The word "hello" found in files: file_1.txt and file_2.txt

{"hello": [{"file_name": "file_1.txt", "occurrences": 6}, {"file_name": "file_2.txt", "occurrences": 2}]}

The advantage of splitting words into separate files is that it will be much faster to read and load each file into memory when running searches - compared to storing the entire index inside one text file.

For this repo I have uploaded 5 sample text files: hamlet.txt, othello.txt, macbeth.txt, julius-caesar.txt, and romeo-and-juliet.txt - each file contains the full text of the corresponding play (thanks to http://shakespeare.mit.edu). Using a 2016 MacBook Air with a 1.6GHz CPU and 4GB of RAM I was able to index all of these text files in around 20 seconds - this produced 23,636 word files.

{"thou": [{"file_name": "hamlet.txt", "occurrences": 85}, {"file_name": "julius-caesar.txt", "occurrences": 101}, {"file_name": "macbeth.txt", "occurrences": 65}, {"file_name": "othello.txt", "occurrences": 115}, {"file_name": "romeo-and-juliet.txt", "occurrences": 233}]}

Fun fact: Romeo and Juliet contains the word "thou" 233 times.

Source code


from file_manager import find_files, create_directory, get_file, check_file, check_directory
from config import create_config, get_config
from indexer import index_files

def main():
    """Set up the working environment and index all matching files.

    Creates ``config.txt`` with default values when it is missing, creates
    the files and index directories named in the config when they are
    missing, then indexes every file in the files directory whose extension
    is listed in the config.
    """
    config_file = "config.txt"
    # if config file doesn't exist
    if not check_file(config_file):
        # create config file
        create_config()
    # get config file once and unpack it
    config = get_config(config_file)
    files_directory = config['files_directory']
    index_directory = config['index_directory']
    extensions = config['extensions']
    delimiter = config['delimiter']
    encoding = config['encoding']
    # if files directory doesn't exist
    if not check_directory(files_directory):
        # create files directory
        create_directory(files_directory)
    # if index directory doesn't exist
    if not check_directory(index_directory):
        # create index directory
        create_directory(index_directory)
    # NOTE: an earlier version loaded an existing index here through
    # load_index(), which is neither imported nor defined in this module and
    # whose result was never used; index_files() already extends word files
    # that exist on disk, so that step is not needed
    # find all files
    files = find_files(files_directory, extensions)
    # if there are files to index
    if files:
        # index files
        index_files(files_directory, index_directory, delimiter, files, encoding)

# run the indexer only when this module is executed as a script
if __name__ == "__main__":
    main()


from json import loads, dumps
from os import path, listdir, makedirs

def find_files(directory, extensions):
    """Return the names of entries in ``directory`` with a wanted extension.

    directory  -- path of the directory to list
    extensions -- collection of extensions without the leading dot,
                  e.g. ["txt"]

    The extension is the text after the LAST dot of the name, so
    "a.b.txt" counts as "txt"; names without a dot are skipped.
    Raises whatever ``listdir`` raises when the directory cannot be listed.
    """
    # initialise files list
    files = []
    # for every file found in directory
    for file in listdir(directory):
        # split on the last dot; separator is empty when there is no dot
        _, separator, extension = file.rpartition(".")
        # check file has an extension and that it is a wanted one
        if separator and extension in extensions:
            # append file to files list
            files.append(file)
    # return list of files
    return files

def get_file(files_directory, file, encoding):
    """Read ``file`` inside ``files_directory`` and return its full text.

    The file is opened in text mode and decoded with ``encoding``.
    """
    # build the location of the file
    location = f"{files_directory}/{file}"
    # open file for reading
    with open(location, "r", encoding=encoding) as handle:
        # read everything in one go
        contents = handle.read()
    # return file data
    return contents

def create_index_file(index_directory, word, file_data, encoding):
    """Create ``<index_directory>/<word>.txt`` and write ``file_data`` to it.

    index_directory -- directory holding the word files
    word            -- word the file belongs to (used as the file name)
    file_data       -- text to store in the file
    encoding        -- text encoding used when writing

    An existing file with the same name is overwritten ("w+" mode).
    """
    # create file and open
    with open("{0}/{1}.txt".format(index_directory, word), "w+", encoding=encoding) as data:
        # write data to file
        data.write(file_data)

def append_index_file(index_directory, word, file_data, encoding):
    """Record one more occurrence of ``word`` in the file named ``file_data``.

    index_directory -- directory holding the word files
    word            -- word whose file ``<index_directory>/<word>.txt`` is updated
    file_data       -- name of the source file in which the word occurred
    encoding        -- text encoding of the word file

    The word file must already exist and contain a JSON object of the form
    {word: [{"file_name": ..., "occurrences": ...}, ...]}. If the source
    file is already listed, its count is incremented by 1; otherwise a new
    entry with a count of 1 is appended. The file is rewritten in place.
    """
    # open existing index file
    with open("{0}/{1}.txt".format(index_directory, word), "r+", encoding=encoding) as data:
        # load json string into object
        word_object = loads(data.read())
        entries = word_object[word]
        # for every file associated with word
        for entry in entries:
            # if the file is already associated with the word
            if entry['file_name'] == file_data:
                # increment its occurrences by 1
                entry['occurrences'] += 1
                break
        else:
            # otherwise append the new file data to the existing word object
            entries.append({"file_name": file_data, "occurrences": 1})
        # go to beginning of file
        data.seek(0)
        # dump the object into json string and overwrite existing data
        data.write(dumps(word_object))
        # drop any leftover bytes in case the new text is shorter
        data.truncate()

def create_directory(directory):
    """Create ``directory`` (including missing parents) and return 1.

    An already existing directory is not treated as an error.
    """
    # create directory; exist_ok avoids a failure if it is already there
    makedirs(directory, exist_ok=True)
    # return success
    return 1

def check_directory(directory):
    """Return 1 when ``directory`` is an existing directory, else None."""
    # 1 signals success; None signals that the directory was not found
    return 1 if path.isdir(directory) else None

def check_file(file):
    """Return 1 when the path ``file`` exists, else None.

    The check uses ``path.exists``, so any existing path counts,
    not only regular files.
    """
    # nothing at that path: fall out with None
    if not path.exists(file):
        return None
    # return success
    return 1


from re import sub
from file_manager import get_file, check_file, create_index_file, append_index_file

def index_files(files_directory, index_directory, delimiter, files, encoding):
    """Add every word of every file in ``files`` to the on-disk index.

    files_directory -- directory the source files are read from
    index_directory -- directory the per-word index files are written to
    delimiter       -- string the text of each file is split on
    files           -- names of the source files to index
    encoding        -- text encoding for both source and index files

    Each piece of text is stripped of non-word characters and lower-cased;
    results of two characters or fewer are ignored. A word seen for the
    first time gets a new word file with an occurrence count of 1; any
    later sighting updates the existing word file. Returns 1.
    """
    # for every file in files
    for file in files:
        # open file and get text
        text = get_file(files_directory, file, encoding)
        # NOTE(review): only `delimiter` separates words, so with the default
        # " " two words divided by just a newline are merged by the
        # sanitising step below - confirm whether that is intended
        # for every word in text split by delimiter
        for word in text.split(delimiter):
            # sanitise word
            word = sub(r'\W+', '', word).lower()
            # check word has required length
            if len(word) <= 2:
                continue
            # if word file doesn't exist
            if not check_file("{0}/{1}.txt".format(index_directory, word)):
                # create json object as string
                file_data = "{{\"{0}\": [{{\"file_name\": \"{1}\", \"occurrences\": 1}}]}}".format(word, file)
                # create word file and write data
                create_index_file(index_directory, word, file_data, encoding)
            else:
                # otherwise append data; only done for existing word files so
                # that a first occurrence is not counted twice
                append_index_file(index_directory, word, file, encoding)
    return 1


from json import loads

def create_config():
    """Write ``config.txt`` with default settings into the working directory.

    The file holds a JSON object with the keys files_directory,
    index_directory, extensions, delimiter and encoding. An existing
    ``config.txt`` is overwritten. Returns 1.
    """
    # create config file
    with open("config.txt", "w+") as config:
        # write default values
        config.write("{\n"
                     "  \"files_directory\": \"files\",\n"
                     "  \"index_directory\": \"index\",\n"
                     "  \"extensions\": [\"txt\"],\n"
                     "  \"delimiter\": \" \",\n"
                     "  \"encoding\": \"UTF-8\"\n"
                     "}")
        # return success
        return 1

def get_config(file):
    """Parse the JSON stored in ``file`` and return the resulting object.

    The file is read as raw bytes and handed to ``loads``.
    """
    # open file in binary mode
    with open(file, "rb") as handle:
        # read the raw bytes
        raw = handle.read()
        # decode the json and hand it back
        return loads(raw)


{ "files_directory": "files", "index_directory": "index", "extensions": ["txt"], "delimiter": " ", "encoding": "UTF-8"}

Thank you for reading.


Leave a comment

Invalid or missing field(s).
Comment sent successfully, please wait for it to be approved.

This post has no comments