Wordcount

Counts the number of appearances of each word from a set of files.

[1]:
import pycompss.interactive as ipycompss
[2]:
import os
# Start the PyCOMPSs runtime. When BINDER_SERVICE_HOST is present in the
# environment (presumably a Binder deployment -- confirm), tracing is enabled
# and explicit project/resources XML files are passed; otherwise the runtime
# is started with the monitor option instead.
if 'BINDER_SERVICE_HOST' not in os.environ:
    ipycompss.start(graph=True, monitor=1000)  # trace=True
else:
    ipycompss.start(graph=True, trace=True, debug=False,
                    project_xml='../xml/project.xml',
                    resources_xml='../xml/resources.xml')
******************************************************
*************** PyCOMPSs Interactive *****************
******************************************************
*          .-~~-.--.           _____       ________  *
*         :         )         |____ \     |____   /  *
*   .~ ~ -.\       /.- ~~ .     ___) |        /  /   *
*   >       `.   .'       <    / ___/        /  /    *
*  (         .- -.         )  | |___   _    /  /     *
*   `- -.-~  `- -'  ~-.- -'   |_____| |_|  /__/      *
*     (        :        )           _ _ .-:          *
*      ~--.    :    .--~        .-~  .-~  }          *
*          ~-.-^-.-~ \_      .~  .-~   .~            *
*                   \ \ '     \ '_ _ -~              *
*                    \`.\`.    //                    *
*           . - ~ ~-.__\`.\`-.//                     *
*       .-~   . - ~  }~ ~ ~-.~-.                     *
*     .' .-~      .-~       :/~-.~-./:               *
*    /_~_ _ . - ~                 ~-.~-._            *
*                                     ~-.<           *
******************************************************
* - Starting COMPSs runtime...                       *
* - Log path : /home/user/.COMPSs/Interactive_01/
* - PyCOMPSs Runtime started... Have fun!            *
******************************************************
[3]:
from pycompss.api.task import task
from pycompss.api.parameter import *
[4]:
@task(returns=dict, file_path=FILE_IN)
def word_count(file_path):
    """
    Read the given file and build a word-frequency dictionary from its content.

    Words are the whitespace-separated tokens of each line (``str.split()``).
    They are counted exactly as they appear, so counting is case-sensitive and
    any punctuation attached to a word is part of that word.

    :param file_path: path of the text file to process
    :return: a dictionary where key=word and value=#appearances
    """
    partialResult = {}
    with open(file_path, 'r') as f:
        # Count while streaming the file so the full list of tokens never
        # has to be held in memory.
        for line in f:
            for entry in line.split():
                partialResult[entry] = partialResult.get(entry, 0) + 1
    return partialResult
Found task: word_count
[5]:
@task(returns=dict, priority=True)
def merge_two_dicts(dic1, dic2):
    """
    Fold the contents of ``dic2`` into ``dic1`` and hand ``dic1`` back.

    Keys present in both dictionaries get their values added together with
    ``+=``; keys only present in ``dic2`` are copied over unchanged.
    ``dic1`` is modified in place and ``dic2`` is left untouched.

    :param dic1: dictionary that accumulates the result
    :param dic2: dictionary whose entries are added to dic1
    :return: dic1 after the update (dic1+=dic2)
    """
    for key, amount in dic2.items():
        if key not in dic1:
            # First time this key is seen: take the value as it is.
            dic1[key] = amount
            continue
        dic1[key] += amount
    return dic1
Found task: merge_two_dicts

Main wordcount code:

[6]:
from pycompss.api.api import compss_wait_on

# Get the dataset path
path_dataset = os.path.join(os.getcwd(), 'dataset')

# Launch one word_count task per regular file of the dataset.
# The listing is sorted because os.listdir() returns the entries in arbitrary
# order, which would make the key order of the merged dictionary differ
# between runs and machines.
partial_result = []
for file_name in sorted(os.listdir(path_dataset)):
    f = os.path.join(path_dataset, file_name)
    if not os.path.isfile(f):
        # Skip sub-directories: word_count opens its argument as a file.
        continue
    partial_result.append(word_count(f))

# Accumulate the partial results to get the final result.
result = {}
for partial in partial_result:
    result = merge_two_dicts(result, partial)

# Wait for result
result = compss_wait_on(result)

Now let's see the results:

[7]:
from pprint import pprint
# Dump the merged frequency dictionary, then the overall number of words counted.
print("Result:")
pprint(result)
print("Total words: {}".format(sum(count for count in result.values())))
Result:
{'Adipisci': 227,
 'Aliquam': 233,
 'Amet': 207,
 'Consectetur': 201,
 'Dolor': 198,
 'Dolore': 236,
 'Dolorem': 232,
 'Eius': 251,
 'Est': 197,
 'Etincidunt': 232,
 'Ipsum': 228,
 'Labore': 229,
 'Magnam': 195,
 'Modi': 201,
 'Neque': 205,
 'Non': 226,
 'Numquam': 253,
 'Porro': 205,
 'Quaerat': 217,
 'Quiquia': 212,
 'Quisquam': 214,
 'Sed': 225,
 'Sit': 220,
 'Tempora': 189,
 'Ut': 217,
 'Velit': 218,
 'Voluptatem': 235,
 'adipisci': 1078,
 'aliquam': 1107,
 'amet': 1044,
 'consectetur': 1073,
 'dolor': 1120,
 'dolore': 1065,
 'dolorem': 1107,
 'eius': 1048,
 'est': 1101,
 'etincidunt': 1114,
 'ipsum': 1061,
 'labore': 1070,
 'magnam': 1096,
 'modi': 1127,
 'neque': 1093,
 'non': 1099,
 'numquam': 1094,
 'porro': 1101,
 'quaerat': 1086,
 'quiquia': 1079,
 'quisquam': 1144,
 'sed': 1109,
 'sit': 1130,
 'tempora': 1064,
 'ut': 1070,
 'velit': 1105,
 'voluptatem': 1121}
Total words: 35409

Plot the results:

[8]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt

words = result.keys()
y_pos = np.arange(len(words))
appearances = result.values()

plt.rcParams['figure.figsize'] = [17, 5]
plt.bar(y_pos, appearances, align='center', alpha=0.5)
plt.grid(axis='y')
plt.xticks(y_pos, words, rotation=90)
plt.ylabel('# appearances')
plt.xlabel('Word')
plt.title('Wordcount')

plt.show()
../_images/Notebooks_wordcount_11_0.png
[9]:
# Shut down the PyCOMPSs runtime that was started at the top of the notebook.
ipycompss.stop()
****************************************************
*************** STOPPING PyCOMPSs ******************
****************************************************
Warning: some of the variables used with PyCOMPSs may
         have not been brought to the master.
****************************************************