/* ========================================================================= *
 * Master thesis "Collaborative filtering, a neural network approach".
 *
 * Counting words over MapReduce
 *
 * - Author: LOUPPE Gilles
 * - Last changes: April 7, 2010
 * ========================================================================= */

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>

#include <mpi.h>
#include "mapreduce.h"
#include "keyvalue.h"

using namespace std;
using namespace MAPREDUCE_NS;


/* ========================================================================= *
 * MapReduce functions
 * ========================================================================= */

/* ------------------------------------------------------------------------- *
 * Map 1 - Emits (word, 1) pairs
 * ------------------------------------------------------------------------- */

/*
 * Map callback for task i: reads the file named files[i] and emits one
 * (word, 1) pair per whitespace-delimited token.
 *
 * - i:   index of the file to read in the array passed through ptr
 * - kv:  KeyValue object receiving the emitted pairs
 * - ptr: array of C-string file names (char**)
 *
 * Words are lower-cased before being emitted and the key size includes the
 * terminating '\0'. A file that cannot be opened is skipped without any
 * message.
 */
void parse(int i, KeyValue* kv, void* ptr){
    char** files = (char**) ptr;
    char* file = files[i];

    ifstream in(file, ios::in);

    if (in.fail()){
        return;
    }

    int one = 1;
    string s;

    // Test the extraction itself instead of eof(). Looping on !in.eof()
    // processes s once more after the final failed read, so the last word is
    // emitted twice when the file ends with whitespace, an empty key is
    // emitted for an empty file, and the loop never ends if the stream fails
    // without reaching end-of-file.
    while (in >> s){
        // tolower() is undefined for negative arguments other than EOF, so
        // every byte is converted through unsigned char first.
        for (size_t j = 0; j < s.size(); j++){
            s[j] = (char) ::tolower((unsigned char) s[j]);
        }

        kv->add(const_cast<char*>(s.c_str()), strlen(s.c_str()) + 1,
                (char*) &one, sizeof(int));
    }

    in.close();
}

/* ------------------------------------------------------------------------- *
 * Reduce 1 - Emits (word, nb_occurrences) pairs
 * ------------------------------------------------------------------------- */

/*
 * Reduce callback: adds up all the values attached to one key and emits a
 * single (key, total) pair.
 *
 * - key, key_size: key and its size in bytes, forwarded unchanged
 * - values:        nb_values values stored one after the other
 * - value_sizes:   size in bytes of each value, used to step through values
 * - kv:            KeyValue object receiving the (key, total) pair
 * - ptr:           unused
 *
 * Every value is read as an int.
 */
void sum(char* key, int key_size,
         char* values, int nb_values, int* value_sizes,
         KeyValue* kv, void* ptr){
    int count = 0;
    char* cursor = values;
    int idx = 0;

    while (idx < nb_values) {
        count += *(int*) cursor;

        // Step over the value just read to reach the next one
        cursor += value_sizes[idx];
        ++idx;
    }

    kv->add(key, key_size, (char*) &count, sizeof(int));
}

/* ------------------------------------------------------------------------- *
 * Sort 1 - Compare nb_occurrences values
 * ------------------------------------------------------------------------- */

/*
 * Comparison callback on two values, each read as an int.
 *
 * Returns -1 when the first value is the larger one, 1 when it is the
 * smaller one and 0 when both are equal. size1 and size2 are not used.
 */
int cmp1(char* value1, int size1, char* value2, int size2) {
    const int lhs = *(int*) value1;
    const int rhs = *(int*) value2;

    if (lhs == rhs) {
        return 0;
    }

    // Larger values compare as "less" (-1)
    return (lhs > rhs) ? -1 : 1;
}

/* ------------------------------------------------------------------------- *
 * Map 2 - Emits (word, nb_occurrences) pairs for the first N pairs
 * ------------------------------------------------------------------------- */

/*
 * Map callback that lets a limited number of pairs through.
 *
 * ptr points to an int counter which is decremented on every call. The
 * incoming (key, value) pair is re-emitted unchanged into kv as long as the
 * decremented counter is not negative; later pairs are dropped.
 * i is not used.
 */
void first_n(uint64_t i, 
             char* key, int key_size, 
             char* value, int value_size,
             KeyValue* kv, void* ptr){
    int* remaining = (int*) ptr;
    --(*remaining);

    // Quota exhausted: drop this pair
    if (*remaining < 0) {
        return;
    }

    kv->add(key, key_size, value, value_size);
}

/* ------------------------------------------------------------------------- *
 * Map 3 - Output
 * ------------------------------------------------------------------------- */

/*
 * Map callback that prints one pair on standard output: the value, read as
 * an int, then a tab, then the key, printed as a C string.
 *
 * Nothing is added to kv; i, key_size, value_size and ptr are not used.
 */
void output(uint64_t i,
            char* key, int key_size,
            char* value, int value_size,
            KeyValue* kv, void* ptr){
    const int count = *(int*) value;

    cout << count << "\t" << key << endl;
}


/* ========================================================================= *
 * main
 * ========================================================================= */

int main(int argc, char** argv) {
    // MPI initialization
    MPI_Init(&argc, &argv);

    int rank, np;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &np);

    if (argc <= 1){
        if (rank == 0){
            cerr << "Usage: ./words file1 file2 ..." << endl;
        }

        MPI_Finalize();
        return EXIT_FAILURE;
    }

    // MapReduce initialization
    MapReduce *mr = new MapReduce(MPI_COMM_WORLD);
    mr->verbosity = 0;
    mr->timer = 0;

    // Map: Parse files
    int nb_words = mr->map(argc - 1, &parse, &argv[1]);
    
    // Reduce: Count the number of occurences
    mr->compress(&sum, NULL);
    mr->collate(NULL);
    int nb_uniques = mr->reduce(&sum, NULL);

    // Output
    int n = 50;
    mr->sort_values(&cmp1);
    mr->map(mr, &first_n, &n);

    n = 50;
    mr->gather(1);
    mr->sort_values(&cmp1);
    mr->map(mr, &first_n, &n);
    mr->map(mr, &output, NULL);

    if (rank == 0) {
        cout << "Total words = " << nb_words << endl;
        cout << "Unique words = " << nb_uniques << endl;
    }

    // Cleanup
    delete mr;
    MPI_Finalize();
    
    return EXIT_SUCCESS;
}
