You can use a pool of worker processes together with a Manager instance; the Manager mediates access to the tf_dict and df_dict dictionaries, which are shared between the workers:
import glob
import multiprocessing as mp
from utils import get_tokens, update_tf, update_df, save_dicts
# Number of worker processes passed to mp.Pool() in main().
WORKERS = 4
def update_dicts(filename, tf_dict, df_dict):
    """Tokenize one file and fold every token into the two shared dicts.

    Args:
        filename: Path handed to ``get_tokens``.
        tf_dict: Mapping updated via ``update_tf`` for each token
            (presumably term frequencies -- confirm against ``utils``).
        df_dict: Mapping updated via ``update_df`` for each token
            (presumably document frequencies -- confirm against ``utils``).

    Both mappings are mutated in place; nothing is returned.
    """
    tokens = get_tokens(filename)
    for tok in tokens:
        update_tf(tok, tf_dict)
        # NOTE(review): update_df runs once per token occurrence, not once
        # per distinct token in the file -- confirm that update_df copes with
        # repeats if df_dict is meant to count documents.
        # NOTE(review): when the dicts are Manager proxies shared between
        # processes, a read-then-write inside update_tf/update_df may not be
        # atomic -- verify in utils.
        update_df(tok, df_dict)
def main():
    """Process every matching text file in a worker pool, then save the dicts.

    Creates two Manager-backed dicts, schedules one ``update_dicts`` task per
    file matching ``/path/to/texts/*.txt`` on a pool of ``WORKERS`` processes,
    waits for all tasks to finish, and passes both dicts to ``save_dicts``.

    Raises:
        Exception: Whatever a worker task raised is re-raised here, before
            anything is saved, so incomplete results are never written
            silently.
    """
    manager = mp.Manager()
    tf_dict = manager.dict()
    df_dict = manager.dict()

    pool = mp.Pool(WORKERS)
    # Keep the AsyncResult handles: apply_async stores a worker's exception on
    # the handle and it is lost unless get() is called.
    pending = [
        pool.apply_async(update_dicts, args=(filename, tf_dict, df_dict))
        for filename in glob.glob('/path/to/texts/*.txt')
    ]
    pool.close()
    pool.join()

    # All tasks are finished after join(); get() returns immediately and
    # re-raises any exception that occurred inside a worker.
    for result in pending:
        result.get()

    save_dicts(tf_dict, df_dict)
# Entry-point guard: multiprocessing may re-import this module in child
# processes, so the pool must only be started when run as a script.
if __name__ == "__main__":
    main()
No comments:
Post a Comment