Monday, October 19, 2020

Python Multiprocessing Snippet

You can use a pool of worker processes and a Manager instance to manage access to the tf_dict and df_dict dictionaries, which are shared between the workers:


import glob
import multiprocessing as mp

from utils import get_tokens, update_tf, update_df, save_dicts

# Number of worker processes in the pool created by main().
WORKERS = 4

def update_dicts(filename, tf_dict, df_dict):
    """Feed every token of one file into the two shared dictionaries.

    Args:
        filename: Path handed to ``get_tokens`` to obtain the tokens.
        tf_dict: Dictionary passed to ``update_tf`` for each token
            (presumably term frequencies -- confirm against ``utils``).
        df_dict: Dictionary passed to ``update_df`` for each token
            (presumably document frequencies -- confirm against ``utils``).

    Returns:
        None. The dictionaries are updated in place by the helpers.
    """
    # NOTE(review): when the dictionaries are Manager proxies shared between
    # processes, whether these updates are race-free depends on how
    # update_tf / update_df are implemented -- verify in utils.
    for token in get_tokens(filename):
        update_tf(token, tf_dict)
        update_df(token, df_dict)

def main():
    """Process every matching text file in a worker pool and save the results.

    A ``Manager`` hosts two shared dictionaries. Each file matched by the
    glob pattern is submitted to a pool of ``WORKERS`` processes as one
    ``update_dicts`` task, and once all tasks have finished both
    dictionaries are handed to ``save_dicts``.

    A task that raises is logged together with its filename instead of
    being dropped silently. Processing stays best-effort: the remaining
    files are still handled and the dictionaries are still saved.
    """
    import logging

    logger = logging.getLogger(__name__)

    # The manager must stay alive until save_dicts() has run: tf_dict and
    # df_dict are proxies to objects that live in the manager process.
    with mp.Manager() as manager:
        tf_dict = manager.dict()
        df_dict = manager.dict()

        # Leaving the ``with`` block terminates the pool, which cleans up
        # the workers if an exception escapes. On the normal path the
        # explicit close()/join() lets every submitted task finish first.
        with mp.Pool(WORKERS) as pool:
            pending = []
            for filename in glob.glob('/path/to/texts/*.txt'):
                result = pool.apply_async(
                    update_dicts, args=(filename, tf_dict, df_dict)
                )
                # Keep the AsyncResult: it is the only channel through
                # which an exception raised in the worker can be observed.
                pending.append((filename, result))

            pool.close()
            pool.join()

        # All tasks are done, so get() returns (or re-raises) immediately.
        for filename, result in pending:
            try:
                result.get()
            except Exception:
                logger.exception("Failed to process %s", filename)

        save_dicts(tf_dict, df_dict)

# Run main() only when this file is executed as a script. With
# multiprocessing the guard also matters for correctness: under the "spawn"
# start method each worker imports this module, and without the guard that
# import would run main() again.
if __name__ == "__main__":
    main()

No comments: