/* =========================================================================== * Filename: rel_count.cc * Description: Relation count class * * Version: $Rev: 17 $ * Changed: $Date: 2005-09-08 03:13:25 +0200 (Do, 08 Sep 2005) $ * Licence: GPL (read COPYING file for details) * * Author: Erich Schubert (eS), erich@debian.org * Institut für Informatik, LMU München * ======================================================================== */ #include "rel_count.h" #include /* for make_pair */ namespace SSD { /* for static lookup hashmap */ unsigned int RelCount::len = 0; hash_map RelCount::cmap; RelCount::RelCount(hash_map& map1, hash_map& map2) : data(NULL) { hash_map::iterator i1, i2; vector tmpdata; /* initialize the first RelCount from the tables of two documents */ if (len != 0) throw "RelCount has been initialized before"; for (i1 = map1.begin(); i1 != map1.end(); i1++) { i2 = map2.find(i1->first); if (i2 == map2.end()) { cmap[i1->first] = RELCOUNT_UNIQUE; } else { if (i1->second == i2->second && i1->second == 1) { cmap[i1->first] = RELCOUNT_ONCE; } else { cmap[i1->first] = len; tmpdata.push_back(i1->second - i2->second); len++; } } } for (i2 = map2.begin(); i2 != map2.end(); i2++) { i1 = map1.find(i2->first); if (i1 == map1.end()) { cmap[i2->first] = RELCOUNT_UNIQUE; } } /* copy data to the minimal data structure */ data = (short*) calloc(len, sizeof(short)); if (tmpdata.size() != len) throw "Inconsitency detected!"; for (unsigned int i=0; i 0 && rc.data) { data = (short*) calloc(len, sizeof(short)); memcpy(data, rc.data, sizeof(short)*len); } } RelCount::RelCount() : data(NULL) { if (len > 0) { data = (short*) calloc(len, sizeof(short)); } } RelCount::~RelCount() { if (data) { free(data); data=NULL; } } int RelCount::modify(RelEqClass key, short val) { int cost=0; /* find the key in lookup table */ hash_map::iterator pos = cmap.find(key); if (pos != cmap.end()) { /* unique keys don't generate costs */ if (pos->second == RELCOUNT_UNIQUE) return 0; /* key occuring once each have a cost of 1 if dropped from the first document */ /* TODO: can it happen that we know first we'll drop it in the * second document? likely? then we should assign costs early in * that case somehow, too! Bitset? */ if (pos->second == RELCOUNT_ONCE) return (val > 0) ? 1 : 0; #ifdef CAREFUL if (pos->second >= len) throw "pos->second out of range in RelCount::modify"; #endif int oldval = data[pos->second]; int newval = oldval - val; data[pos->second] = newval; /* calculate costs */ if (oldval > 0) { if ( val < 0) cost += -val; if (newval < 0) cost += -newval; } else if (oldval < 0) { if ( val > 0) cost += val; if (newval > 0) cost += newval; } else /* if (oldval == 0) */ { cost = abs(val); /* == abs(newval) */ } } else throw "New key encountered during 'modify'."; return cost; } void RelCount::operator+=(const RelCount& other) { for (unsigned int i=0; i::iterator i; for (i = cmap.begin(); i != cmap.end(); i++) { out << i->first << ": " << i->second << " "; } } int RelCount::calc_max_retained(hash_map& map1, hash_map& map2) { hash_map::iterator i1, i2; int max_retained=0; for (i1 = map1.begin(); i1 != map1.end(); i1++) { i2 = map2.find(i1->first); if (i2 != map2.end()) { max_retained += min(i1->second, i2->second); } } return max_retained; } }