+
+/* build the counts for all the tokens that start with "data", and have lenghts
+ * from 2 to "len" */
+static void learn_token(unsigned char *data, int len)
+{
+ struct token *ptr,*last_ptr;
+ int i, newprofit;
+ unsigned int hash = HASH_BASE_OFFSET;
+ unsigned int hashes[MAX_TOK_SIZE + 1];
+
+ if (len > MAX_TOK_SIZE)
+ len = MAX_TOK_SIZE;
+
+ /* calculate and store the hash values for all the sub-tokens */
+ hash = rehash_token(hash, data[0]);
+ for (i = 2; i <= len; i++) {
+ hash = rehash_token(hash, data[i-1]);
+ hashes[i] = HASH_FOLD(hash);
+ }
+
+ last_ptr = NULL;
+ ptr = NULL;
+
+ for (i = len; i >= 2; i--) {
+ hash = hashes[i];
+
+ if (!ptr) ptr = find_token_hash(data, i, hash);
+
+ if (!ptr) {
+ /* create a new token entry */
+ ptr = (struct token *) malloc(sizeof(*ptr));
+
+ memcpy(ptr->data, data, i);
+ ptr->len = i;
+
+ /* when we create an entry, it's profit is 0 because
+ * we also take into account the size of the token on
+ * the compressed table. We then subtract GOOD_BAD_THRESHOLD
+ * so that the test to see if this token belongs to
+ * the good or bad list, is a comparison to zero */
+ ptr->profit = -GOOD_BAD_THRESHOLD;
+
+ ptr->next = hash_table[hash];
+ hash_table[hash] = ptr;
+
+ insert_token_in_group(&bad_head, ptr);
+
+ ptr->smaller = NULL;
+ } else {
+ newprofit = ptr->profit + (ptr->len - 1);
+ /* check to see if this token needs to be moved to a
+ * different list */
+ if((ptr->profit < 0) && (newprofit >= 0)) {
+ remove_token_from_group(ptr);
+ insert_token_in_group(&good_head,ptr);
+ }
+ ptr->profit = newprofit;
+ }
+
+ if (last_ptr) last_ptr->smaller = ptr;
+ last_ptr = ptr;
+
+ ptr = ptr->smaller;
+ }
+}
+
+/* decrease the counts for all the tokens that start with "data", and have lenghts
+ * from 2 to "len". This function is much simpler than learn_token because we have
+ * more guarantees (tho tokens exist, the ->smaller pointer is set, etc.)
+ * The two separate functions exist only because of compression performance */
+static void forget_token(unsigned char *data, int len)
+{
+ struct token *ptr;
+ int i, newprofit;
+ unsigned int hash=0;
+
+ if (len > MAX_TOK_SIZE) len = MAX_TOK_SIZE;
+
+ hash = hash_token(data, len);
+ ptr = find_token_hash(data, len, hash);
+
+ for (i = len; i >= 2; i--) {
+
+ newprofit = ptr->profit - (ptr->len - 1);
+ if ((ptr->profit >= 0) && (newprofit < 0)) {
+ remove_token_from_group(ptr);
+ insert_token_in_group(&bad_head, ptr);
+ }
+ ptr->profit=newprofit;
+
+ ptr=ptr->smaller;
+ }
+}
+
+/* count all the possible tokens in a symbol */
+static void learn_symbol(unsigned char *symbol, int len)
+{
+ int i;
+
+ for (i = 0; i < len - 1; i++)
+ learn_token(symbol + i, len - i);
+}
+
+/* decrease the count for all the possible tokens in a symbol */
+static void forget_symbol(unsigned char *symbol, int len)
+{
+ int i;
+
+ for (i = 0; i < len - 1; i++)
+ forget_token(symbol + i, len - i);
+}
+
+/* set all the symbol flags and do the initial token count */
+static void build_initial_tok_table(void)
+{
+ int i, use_it, valid;
+
+ valid = 0;
+ for (i = 0; i < cnt; i++) {
+ table[i].flags = 0;
+ if ( symbol_valid(&table[i]) ) {
+ table[i].flags |= SYM_FLAG_VALID;
+ valid++;
+ }
+ }
+
+ use_it = 0;
+ for (i = 0; i < cnt; i++) {
+
+ /* subsample the available symbols. This method is almost like
+ * a Bresenham's algorithm to get uniformly distributed samples
+ * across the symbol table */
+ if (table[i].flags & SYM_FLAG_VALID) {
+
+ use_it += WORKING_SET;
+
+ if (use_it >= valid) {
+ table[i].flags |= SYM_FLAG_SAMPLED;
+ use_it -= valid;
+ }
+ }
+ if (table[i].flags & SYM_FLAG_SAMPLED)
+ learn_symbol(table[i].sym, table[i].len);
+ }
+}
+
+/* replace a given token in all the valid symbols. Use the sampled symbols
+ * to update the counts */
+static void compress_symbols(unsigned char *str, int tlen, int idx)
+{
+ int i, len, learn, size;
+ unsigned char *p;
+
+ for (i = 0; i < cnt; i++) {
+
+ if (!(table[i].flags & SYM_FLAG_VALID)) continue;
+
+ len = table[i].len;
+ learn = 0;
+ p = table[i].sym;
+
+ do {
+ /* find the token on the symbol */
+ p = (unsigned char *) strstr((char *) p, (char *) str);
+ if (!p) break;
+
+ if (!learn) {
+ /* if this symbol was used to count, decrease it */
+ if (table[i].flags & SYM_FLAG_SAMPLED)
+ forget_symbol(table[i].sym, len);
+ learn = 1;
+ }
+
+ *p = idx;
+ size = (len - (p - table[i].sym)) - tlen + 1;
+ memmove(p + 1, p + tlen, size);
+ p++;
+ len -= tlen - 1;
+
+ } while (size >= tlen);
+
+ if(learn) {
+ table[i].len = len;
+ /* if this symbol was used to count, learn it again */
+ if(table[i].flags & SYM_FLAG_SAMPLED)
+ learn_symbol(table[i].sym, len);
+ }
+ }
+}
+
+/* search the token with the maximum profit */
+static struct token *find_best_token(void)
+{
+ struct token *ptr,*best,*head;
+ int bestprofit;
+
+ bestprofit=-10000;
+
+ /* failsafe: if the "good" list is empty search from the "bad" list */
+ if(good_head.right == &good_head) head = &bad_head;
+ else head = &good_head;
+
+ ptr = head->right;
+ best = NULL;
+ while (ptr != head) {
+ if (ptr->profit > bestprofit) {
+ bestprofit = ptr->profit;
+ best = ptr;
+ }
+ ptr = ptr->right;
+ }
+
+ return best;
+}
+
+/* this is the core of the algorithm: calculate the "best" table */
+static void optimize_result(void)
+{
+ struct token *best;
+ int i;
+
+ /* using the '\0' symbol last allows compress_symbols to use standard
+ * fast string functions */
+ for (i = 255; i >= 0; i--) {
+
+ /* if this table slot is empty (it is not used by an actual
+ * original char code */
+ if (!best_table_len[i]) {
+
+ /* find the token with the breates profit value */
+ best = find_best_token();
+
+ /* place it in the "best" table */
+ best_table_len[i] = best->len;
+ memcpy(best_table[i], best->data, best_table_len[i]);
+ /* zero terminate the token so that we can use strstr
+ in compress_symbols */
+ best_table[i][best_table_len[i]]='\0';
+
+ /* replace this token in all the valid symbols */
+ compress_symbols(best_table[i], best_table_len[i], i);
+ }
+ }
+}
+
+/* start by placing the symbols that are actually used on the table */
+static void insert_real_symbols_in_table(void)
+{
+ int i, j, c;
+
+ memset(best_table, 0, sizeof(best_table));
+ memset(best_table_len, 0, sizeof(best_table_len));
+
+ for (i = 0; i < cnt; i++) {
+ if (table[i].flags & SYM_FLAG_VALID) {
+ for (j = 0; j < table[i].len; j++) {
+ c = table[i].sym[j];
+ best_table[c][0]=c;
+ best_table_len[c]=1;
+ }
+ }
+ }
+}
+
+static void optimize_token_table(void)
+{
+ memset(hash_table, 0, sizeof(hash_table));
+
+ good_head.left = &good_head;
+ good_head.right = &good_head;
+
+ bad_head.left = &bad_head;
+ bad_head.right = &bad_head;
+
+ build_initial_tok_table();
+
+ insert_real_symbols_in_table();
+
+ optimize_result();
+}
+
+