Algorithms to Compute the Difference of Two Maps


When unit testing, you often want to assert two things are equal. This is common for maps (dictionaries), where you may have a return map and the expected one. For example:

1
assertEquals(Map("one" -> 1), Map("one" -> 2));
assertEquals(Map("one" -> 1), Map("one" -> 2));

would throw an error with a message along the lines of:

1
Map("one" -> 1) did not equal Map("one" -> 2)
Map("one" -> 1) did not equal Map("one" -> 2)

Printing out the contents of both maps makes it hard for the caller to find the actual differences they care about, especially in large maps. It’s hard to see the difference in the following output:

1
2
Map("six" -> 6, "four" --> 4, "two" --> 2, "one" --> 1, "three" --> 3, "five" --> 5) did not equal 
Map("three" --> 3, "one" --> 1, "two" --> 2, "four" --> 5, "six" --> 6, "five" --> 5)"
Map("six" -> 6, "four" --> 4, "two" --> 2, "one" --> 1, "three" --> 3, "five" --> 5) did not equal 
Map("three" --> 3, "one" --> 1, "two" --> 2, "four" --> 5, "six" --> 6, "five" --> 5)"

Our goal is to write a function that computes the differences between two maps and returns the result in a data structure of your choosing. This should make it easy for the caller to assert equality and print the differences in a meaningful way. The function should be easy to use and well documented. Do not print a string; writing a helper function to render our result into a String is outside of the scope of this question.

Compute the Difference of Two Maps

Considering refactor the MapDiff class so that the difference between HashMaps A and B, B and A can be simply calling a same utility function. We return the difference type in a Enum.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#include <unordered_map>
#include <vector>
 
using namespace std;
 
enum DiffType {
    VALUE,
    MINUS,
    PLUS
};
 
template <class T, class V>
// typedef vector<tuple<DiffType, T, V, V> ArrayOfDiffs;
 
template <class T, class V>
class MapDiff {
public:
    MapDiff(const unordered_map<T, V> &m) {
        data = m;
    }
    
    vector<tuple<DiffType, T, V, V>> getDiff(const unordered_map<T, V> &anotherMap) {
        auto diffs = getMapDiff(data, anotherMap);
        auto anotherDiffs = getMapDiff(anotherMap, data);
        vector<tuple<DiffType, T, V, V>> result(diffs.first);
        result.insert(end(result), begin(diffs->second), end(diffs->second));
        result.insert(end(result), begin(anotherDiffs->second), end(anotherDiffs->second));
        return result;
    }
    
private:
    unordered_map<T, V> data;
    
    // N entries for map1, M for map2
    // Time: O(N + M)
    // Space: O(N + M)
    pair<vector<tuple<DiffType, T, V, V>>, vector<tuple<DiffType, T, V, V>>> getMapDiff(
        const unordered_map<T, V> &map1, const unordered_map<T, V> &map2) {
        vector<tuple<DiffType, T, V, V>> diffWithSameKey;
        vector<tuple<DiffType, T, V, V>> otherDiffs;
        for (auto it = begin(map1); it != end(map1); ++ it) {
            auto key = it->first;
            auto value = it->second;
            if (map2.count(key)) {
                // if the key appears in both map
                if (value != map2[key]) {
                    diffWithSameKey.push_back(make_tuple(VALUE, key, value, map2[key]));
                }
            } else {
                // second map doesn't have this key
                otherDiffs.push_back(make_tuple(PLUS, key, value, value));
            }
        }        
        return {diffWithSameKey, otherDiffs};
    }    
};
#include <unordered_map>
#include <vector>

using namespace std;

enum DiffType {
    VALUE,
    MINUS,
    PLUS
};

template <class T, class V>
// typedef vector<tuple<DiffType, T, V, V> ArrayOfDiffs;

template <class T, class V>
class MapDiff {
public:
    MapDiff(const unordered_map<T, V> &m) {
        data = m;
    }
    
    vector<tuple<DiffType, T, V, V>> getDiff(const unordered_map<T, V> &anotherMap) {
        auto diffs = getMapDiff(data, anotherMap);
        auto anotherDiffs = getMapDiff(anotherMap, data);
        vector<tuple<DiffType, T, V, V>> result(diffs.first);
        result.insert(end(result), begin(diffs->second), end(diffs->second));
        result.insert(end(result), begin(anotherDiffs->second), end(anotherDiffs->second));
        return result;
    }
    
private:
    unordered_map<T, V> data;
    
    // N entries for map1, M for map2
    // Time: O(N + M)
    // Space: O(N + M)
    pair<vector<tuple<DiffType, T, V, V>>, vector<tuple<DiffType, T, V, V>>> getMapDiff(
        const unordered_map<T, V> &map1, const unordered_map<T, V> &map2) {
        vector<tuple<DiffType, T, V, V>> diffWithSameKey;
        vector<tuple<DiffType, T, V, V>> otherDiffs;
        for (auto it = begin(map1); it != end(map1); ++ it) {
            auto key = it->first;
            auto value = it->second;
            if (map2.count(key)) {
                // if the key appears in both map
                if (value != map2[key]) {
                    diffWithSameKey.push_back(make_tuple(VALUE, key, value, map2[key]));
                }
            } else {
                // second map doesn't have this key
                otherDiffs.push_back(make_tuple(PLUS, key, value, value));
            }
        }        
        return {diffWithSameKey, otherDiffs};
    }    
};

Distributed Version of Map Difference

If this is to run on a distributed environment where a single map may not be fully loaded on one physical server/machine, we can sort the Map by keys or use the map (sorted by Keys) in C++. Then, similar than merging two sorted list, we use two iterator (pointer) to point to begining of each map, then advance the iterator/pointer as the comparison goes on.

The time complexity (because of overhead by maintaining the sorted keys), would be O(NlogN+MlogM) as compared to O(N+M) in the first version.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#include <map>
#include <vector>
 
using namespace std;
 
enum DiffType {
    VALUE,
    MINUS,
    PLUS
};
 
template <class T, class V>
// typedef vector<tuple<DiffType, T, V, V> ArrayOfDiffs;
 
template <class T, class V>
class MapDiff {
public:
    MapDiff(const unordered_map<T, V> &m) {
        data = m;
    }
    
    vector<tuple<DiffType, T, V, V>> getDiff(const map<T, V> &anotherMap) {
        auto diffs = getMapDiff(data, anotherMap);
        auto anotherDiffs = getMapDiff(anotherMap, data);
        vector<tuple<DiffType, T, V, V>> result(diffs.first);
        result.insert(end(result), begin(diffs->second), end(diffs->second));
        result.insert(end(result), begin(anotherDiffs->second), end(anotherDiffs->second));
        return result;
    }
    
private:
    map<T, V> data;
        
    // O(nlogn + mlogm)
    pair<vector<tuple<DiffType, T, V, V>>, vector<tuple<DiffType, T, V, V>>> getMapDiffV2(
        const map<T, V> &map1, const map<T, V> &map2) {
        vector<tuple<DiffType, T, V, V>> diffWithSameKey;
        vector<tuple<DiffType, T, V, V>> otherDiffs;
        auto itOfMap1 = begin(map1);
        auto itOfMap2 = begin(map2);        
        while (itOfMap1 != end(map1) && itOfMap2 != end(map2)) {
            if (itOfMap1->first == itOfMap2->first) {
                // if the key appears in both map
                if (itOfMap1->second != itOfMap2->second) {
                    diffWithSameKey.push_back(make_tuple(VALUE, key, itOfMap1->second, itOfMap2->second));
                }
                itOfMap1 ++;                
                itOfMap2 ++;
            } else {
                if (itOfMap1->first < itOfMap2->first) {
                    otherDiffs.push_back(make_tuple(PLUS, key, itOfMap1->second, itOfMap2->second));
                    itOfMap1 ++;
                } else {
                    otherDiffs.push_back(make_tuple(MINUS, key, itOfMap1->second, itOfMap2->second));
                    itOfMap2 ++;
                }
            }            
        }
        return {diffWithSameKey, otherDiffs};
    }    
};
#include <map>
#include <vector>

using namespace std;

enum DiffType {
    VALUE,
    MINUS,
    PLUS
};

template <class T, class V>
// typedef vector<tuple<DiffType, T, V, V> ArrayOfDiffs;

template <class T, class V>
class MapDiff {
public:
    MapDiff(const unordered_map<T, V> &m) {
        data = m;
    }
    
    vector<tuple<DiffType, T, V, V>> getDiff(const map<T, V> &anotherMap) {
        auto diffs = getMapDiff(data, anotherMap);
        auto anotherDiffs = getMapDiff(anotherMap, data);
        vector<tuple<DiffType, T, V, V>> result(diffs.first);
        result.insert(end(result), begin(diffs->second), end(diffs->second));
        result.insert(end(result), begin(anotherDiffs->second), end(anotherDiffs->second));
        return result;
    }
    
private:
    map<T, V> data;
        
    // O(nlogn + mlogm)
    pair<vector<tuple<DiffType, T, V, V>>, vector<tuple<DiffType, T, V, V>>> getMapDiffV2(
        const map<T, V> &map1, const map<T, V> &map2) {
        vector<tuple<DiffType, T, V, V>> diffWithSameKey;
        vector<tuple<DiffType, T, V, V>> otherDiffs;
        auto itOfMap1 = begin(map1);
        auto itOfMap2 = begin(map2);        
        while (itOfMap1 != end(map1) && itOfMap2 != end(map2)) {
            if (itOfMap1->first == itOfMap2->first) {
                // if the key appears in both map
                if (itOfMap1->second != itOfMap2->second) {
                    diffWithSameKey.push_back(make_tuple(VALUE, key, itOfMap1->second, itOfMap2->second));
                }
                itOfMap1 ++;                
                itOfMap2 ++;
            } else {
                if (itOfMap1->first < itOfMap2->first) {
                    otherDiffs.push_back(make_tuple(PLUS, key, itOfMap1->second, itOfMap2->second));
                    itOfMap1 ++;
                } else {
                    otherDiffs.push_back(make_tuple(MINUS, key, itOfMap1->second, itOfMap2->second));
                    itOfMap2 ++;
                }
            }            
        }
        return {diffWithSameKey, otherDiffs};
    }    
};

Some Test cases to unit test the solution or if you prefer Test Driven Development.

1
2
3
4
5
6
A = {}, B = {} ==> []
A = {}, B = {a: b} => [(Plus, a, b)]
A = {a: b}, B = {} => [(Minus, a, b)]
A = {a: b}, B = {a: b} => []
A = {A: b}, B = {a: b} => [(Plus, a, b), (Minus, A, b)]
A = {A: b}, B = {A, c} => [(VALUE, A, b, c)]
A = {}, B = {} ==> []
A = {}, B = {a: b} => [(Plus, a, b)]
A = {a: b}, B = {} => [(Minus, a, b)]
A = {a: b}, B = {a: b} => []
A = {A: b}, B = {a: b} => [(Plus, a, b), (Minus, A, b)]
A = {A: b}, B = {A, c} => [(VALUE, A, b, c)]

–EOF (The Ultimate Computing & Technology Blog) —

GD Star Rating
loading...
1113 words
Last Post: Teaching Kids Programming - Introduction to Object Oriented Programming
Next Post: Teaching Kids Programming - Algorithms to Search in Binary Search Tree

The Permanent URL is: Algorithms to Compute the Difference of Two Maps

Leave a Reply