+ int i;
+
+ for (i = 0; i < len - 1; i++)
+ token_profit[ symbol[i] + (symbol[i + 1] << 8) ]++;
+}
+
+/* decrease the count for all the possible tokens in a symbol */
+static void forget_symbol(unsigned char *symbol, int len)
+{
+ int i;
+
+ for (i = 0; i < len - 1; i++)
+ token_profit[ symbol[i] + (symbol[i + 1] << 8) ]--;
+}
+
+/* remove all the invalid symbols from the table and do the initial token count */
+static void build_initial_tok_table(void)
+{
+ unsigned int i, pos;
+
+ pos = 0;
+ for (i = 0; i < table_cnt; i++) {
+ if ( symbol_valid(&table[i]) ) {
+ if (pos != i)
+ table[pos] = table[i];
+ learn_symbol(table[pos].sym, table[pos].len);
+ pos++;
+ }
+ }
+ table_cnt = pos;
+}
+
+/* replace a given token in all the valid symbols. Use the sampled symbols
+ * to update the counts */
+static void compress_symbols(unsigned char *str, int idx)
+{
+ unsigned int i, len, size;
+ unsigned char *p1, *p2;
+
+ for (i = 0; i < table_cnt; i++) {
+
+ len = table[i].len;
+ p1 = table[i].sym;
+
+ /* find the token on the symbol */
+ p2 = memmem(p1, len, str, 2);
+ if (!p2) continue;
+
+ /* decrease the counts for this symbol's tokens */
+ forget_symbol(table[i].sym, len);
+
+ size = len;
+
+ do {
+ *p2 = idx;
+ p2++;
+ size -= (p2 - p1);
+ memmove(p2, p2 + 1, size);
+ p1 = p2;
+ len--;
+
+ if (size < 2) break;
+
+ /* find the token on the symbol */
+ p2 = memmem(p1, size, str, 2);
+
+ } while (p2);
+
+ table[i].len = len;
+
+ /* increase the counts for this symbol's new tokens */
+ learn_symbol(table[i].sym, len);
+ }
+}
+
+/* search the token with the maximum profit */
+static int find_best_token(void)
+{
+ int i, best, bestprofit;
+
+ bestprofit=-10000;
+ best = 0;
+
+ for (i = 0; i < 0x10000; i++) {
+ if (token_profit[i] > bestprofit) {
+ best = i;
+ bestprofit = token_profit[i];
+ }
+ }
+ return best;
+}
+
+/* this is the core of the algorithm: calculate the "best" table */
+static void optimize_result(void)
+{
+ int i, best;
+
+ /* using the '\0' symbol last allows compress_symbols to use standard
+ * fast string functions */
+ for (i = 255; i >= 0; i--) {
+
+ /* if this table slot is empty (it is not used by an actual
+ * original char code */
+ if (!best_table_len[i]) {
+
+ /* find the token with the breates profit value */
+ best = find_best_token();
+
+ /* place it in the "best" table */
+ best_table_len[i] = 2;
+ best_table[i][0] = best & 0xFF;
+ best_table[i][1] = (best >> 8) & 0xFF;
+
+ /* replace this token in all the valid symbols */
+ compress_symbols(best_table[i], i);
+ }
+ }
+}
+
+/* start by placing the symbols that are actually used on the table */
+static void insert_real_symbols_in_table(void)
+{
+ unsigned int i, j, c;
+
+ memset(best_table, 0, sizeof(best_table));
+ memset(best_table_len, 0, sizeof(best_table_len));
+
+ for (i = 0; i < table_cnt; i++) {
+ for (j = 0; j < table[i].len; j++) {
+ c = table[i].sym[j];
+ best_table[c][0]=c;
+ best_table_len[c]=1;
+ }
+ }
+}
+
+static void optimize_token_table(void)
+{
+ build_initial_tok_table();
+
+ insert_real_symbols_in_table();
+
+ /* When valid symbol is not registered, exit to error */
+ if (!table_cnt) {
+ fprintf(stderr, "No valid symbol.\n");
+ exit(1);
+ }
+
+ optimize_result();
+}
+
+
+int main(int argc, char **argv)
+{
+ if (argc >= 2) {
+ int i;
+ for (i = 1; i < argc; i++) {
+ if(strcmp(argv[i], "--all-symbols") == 0)
+ all_symbols = 1;
+ else if (strncmp(argv[i], "--symbol-prefix=", 16) == 0) {
+ char *p = &argv[i][16];
+ /* skip quote */
+ if ((*p == '"' && *(p+2) == '"') || (*p == '\'' && *(p+2) == '\''))
+ p++;
+ symbol_prefix_char = *p;
+ } else
+ usage();
+ }
+ } else if (argc != 1)