From c9d66e44af5c93a1ea5487fd9bff78274be65850 Mon Sep 17 00:00:00 2001 From: Manuel Novoa III Date: Fri, 20 Dec 2002 19:26:35 +0000 Subject: The big thing is locale dependent collation support. Also added outdigit support and (legacy) YESSTR/NOSTR support. --- extra/locale/gen_collate.c | 3920 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 3920 insertions(+) create mode 100644 extra/locale/gen_collate.c (limited to 'extra/locale/gen_collate.c') diff --git a/extra/locale/gen_collate.c b/extra/locale/gen_collate.c new file mode 100644 index 000000000..9121345bc --- /dev/null +++ b/extra/locale/gen_collate.c @@ -0,0 +1,3920 @@ +/* TODO: + * + * add UNDEFINED at end if not specified + * convert POSITION -> FORWARD,POSITION + * + * + * deal with lowercase in + * + * what about reorders that keep the same rule? + * + * remove "unused" collation elements? (probably doesn't save much) + * + * add_rule function ... returns index into rule table after possibly adding custom-indexed rule + * but don't forget about multichar weights... 
replace with strings of indexes + * + */ + + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include <stdarg.h> +#include <limits.h> +#include <ctype.h> +#include <assert.h> +#include <search.h> + + +typedef struct { + char *name; /* */ + + int num_weights; /* */ + + int ii_shift; /* */ + int ti_shift; /* */ + int ii_len; /* */ + int ti_len; /* */ + int max_weight; /* */ + int num_col_base; /* */ + int max_col_index; /* */ + int undefined_idx; /* */ + int range_low; /* */ + int range_count; /* high - low */ + int range_base_weight; /* */ + int num_starters; /* */ + + int range_rule_offset; /* */ + int wcs2colidt_offset; /* */ + int index2weight_offset; /* */ + int index2ruleidx_offset; /* */ + int multistart_offset; /* */ + +} base_locale_t; + +#define BASE_LOCALE_LEN 20 +static base_locale_t base_locale_array[BASE_LOCALE_LEN]; +static size_t base_locale_len; + +typedef struct { + char *name; /* */ + + int base_idx; /* */ + + int undefined_idx; /* */ + + int overrides_offset; /* */ + int multistart_offset; /* */ +} der_locale_t; + +#define DER_LOCALE_LEN 300 +static der_locale_t der_locale_array[DER_LOCALE_LEN]; +static size_t der_locale_len; + + +#define OVERRIDE_LEN 50000 +static uint16_t override_buffer[OVERRIDE_LEN]; +static size_t override_len; + +#define MULTISTART_LEN 10000 +static uint16_t multistart_buffer[MULTISTART_LEN]; +static size_t multistart_len; + +#define WCS2COLIDT_LEN 200000 +static uint16_t wcs2colidt_buffer[WCS2COLIDT_LEN]; +static size_t wcs2colidt_len; + +#define INDEX2WEIGHT_LEN 200000 +static uint16_t index2weight_buffer[INDEX2WEIGHT_LEN]; +static size_t index2weight_len; + +static uint16_t index2ruleidx_buffer[INDEX2WEIGHT_LEN]; +static size_t index2ruleidx_len; + +#define WEIGHTSTR_LEN 10000 +static uint16_t weightstr_buffer[WEIGHTSTR_LEN]; +static size_t weightstr_len; + +#define RULETABLE_LEN (1L<<16) +static uint16_t ruletable_buffer[RULETABLE_LEN]; +static size_t ruletable_len; + + +#define RANGE (0x10000UL) + +typedef uint16_t 
tbl_item; + +static uint16_t u16_buf[10000]; +static int u16_buf_len; +static int u16_starter; + +typedef struct { + uint16_t ii_len; + uint16_t ti_len; + uint16_t ut_len; + + unsigned char ii_shift; + unsigned char ti_shift; + + tbl_item *ii; + tbl_item *ti; + tbl_item *ut; +} table_data; + + +static size_t newopt(tbl_item *ut, size_t usize, int shift, table_data *tbl); + + +#define MAX_COLLATION_WEIGHTS 4 + +#define MAX_FNO 1 +#define MAX_FILES (MAX_FNO + 1) + +static FILE *fstack[MAX_FILES]; +static char *fname[MAX_FILES]; +static int lineno[MAX_FILES]; +static int fno = -1; + +static tbl_item wcs2index[RANGE]; + +static char linebuf[1024]; +static char *pos; +static char *pos_e = NULL; +static char end_of_token = 0; /* slot to save */ + +#define IN_ORDER 0x01 +#define IN_REORDER 0x02 +#define IN_REORDER_SECTIONS 0x04 +static int order_state; +static int cur_num_weights; /* number of weights in current use */ +static char cur_rule[MAX_COLLATION_WEIGHTS]; + +static int anonsection = 0; + +typedef struct ll_item_struct ll_item_t; + +struct ll_item_struct { + ll_item_t *next; + ll_item_t *prev; + void *data; + int data_type; + int idx; +}; + +static ll_item_t *reorder_section_ptr = NULL; +static int superset; +static int superset_order_start_cnt; /* only support one order for now */ +static int superset_in_sync; +static ll_item_t *comm_cur_ptr; +static ll_item_t *comm_prev_ptr; + +enum { + R_FORWARD = 0x01, + R_POSITION = 0x02, + R_BACKWARD = 0x04 /* must be largest in value */ +}; + +typedef struct { + size_t num_weights; + char rule[MAX_COLLATION_WEIGHTS]; + const char *colitem[MAX_COLLATION_WEIGHTS]; +} weight_t; + +static void *root_weight = NULL; +size_t unique_weights = 0; + +typedef struct { + const char *symbol; + weight_t *weight; +} weighted_item_t; + +typedef struct { + const char *symbol1; + const char *symbol2; + int length; + weight_t *weight; +} range_item_t; + +typedef struct { + const char *name; + ll_item_t *itm_list; /* weighted_item_t list .. 
circular!!! */ + size_t num_items; + size_t num_rules; + char rules[MAX_COLLATION_WEIGHTS]; +} section_t; + +static section_t *cur_section = NULL; + +typedef struct { + const char *symbol; + ll_item_t *node; +} wi_index_t; + +typedef struct col_locale_struct col_locale_t; + +struct col_locale_struct { + char *name; + void *root_colitem; /* all base and derived, or just derived */ + void *root_element; + void *root_scripts; + void *root_wi_index; + void *root_wi_index_reordered; + ll_item_t *section_list; + col_locale_t *base_locale; /* null if this is a base */ + void *root_derived_wi; + ll_item_t *derived_list; + void *root_starter_char; + void *root_starter_all; + ll_item_t *undefined_idx; +}; + +typedef struct { + const char *symbol; + int idx; +} col_index_t; + +static void *root_col_locale = NULL; + +typedef struct { + const char *keyword; + void (*handler)(void); +} keyword_table_t; + +typedef struct { + const char *string; + const char *element; /* NULL if collating symbol */ +} colitem_t; + +static col_locale_t *cur_base = NULL; +static col_locale_t *cur_derived = NULL; +static col_locale_t *cur_col = NULL; + +static void *root_sym = NULL; +static size_t num_sym = 0; +static size_t mem_sym = 0; + +static void error_msg(const char *fmt, ...) __attribute__ ((noreturn, format (printf, 1, 2))); +static void *xmalloc(size_t n); +static char *xsymdup(const char *s); /* only allocate once... 
store in a tree */ +static void pushfile(char *filename); +static void popfile(void); +static void processfile(void); +static int iscommentchar(int); +static void eatwhitespace(void); +static int next_line(void); +static char *next_token(void); +static void do_unrecognized(void); +static col_locale_t *new_col_locale(char *name); +static ll_item_t *new_ll_item(int data_type, void *data); +static weight_t *register_weight(weight_t *w); +static size_t ll_len(ll_item_t *l); +static size_t ll_count(ll_item_t *l, int mask); +static void add_wi_index(ll_item_t *l); +static size_t tnumnodes(const void *root); +static ll_item_t *find_wi_index(const char *sym, col_locale_t *cl); +static void mark_reordered(const char *sym); +static ll_item_t *find_wi_index_reordered(const char *sym); +static ll_item_t *next_comm_ptr(void); +static ll_item_t *init_comm_ptr(void); +static ll_item_t *find_ll_last(ll_item_t *p); +static void dump_weights(const char *name); +static void finalize_base(void); +static int is_ucode(const char *s); +static int sym_cmp(const void *n1, const void *n2); +static void do_starter_lists(col_locale_t *cl); +static void dump_base_locale(int n); +static void dump_der_locale(int n); +static void dump_collate(FILE *fp); + +enum { + DT_SECTION = 0x01, + DT_WEIGHTED = 0x02, + DT_REORDER = 0x04, /* a section to support reorder_after */ + DT_COL_LOCALE = 0x08, + DT_RANGE = 0x10, +}; + +static section_t *new_section(const char *name) +{ + section_t *p; + char buf[128]; + + p = xmalloc(sizeof(section_t)); + if (!name) { /* anonymous section */ + name = buf; + snprintf(buf, sizeof(buf), "anon%05d", anonsection); + ++anonsection; + } else if (*name != '<') { /* reorder */ + name = buf; + snprintf(buf, sizeof(buf), "%s %05d", cur_col->name, anonsection); + ++anonsection; + } +#warning devel code +/* fprintf(stderr, "section %s\n", name); */ + p->name = xsymdup(name); + p->itm_list = NULL; + p->num_items = 0; + p->num_rules = 0; + memset(p->rules, 0, 
MAX_COLLATION_WEIGHTS); +/* cur_num_weights = p->num_rules = 0; */ +/* memset(p->rules, 0, MAX_COLLATION_WEIGHTS); */ +/* memset(cur_rule, R_FORWARD, 4); */ + +#warning devel code + if (*p->name == 'a') { + cur_num_weights = p->num_rules = 4; + memset(p->rules, R_FORWARD, 4); + memset(cur_rule, R_FORWARD, 4); + p->rules[3] |= R_POSITION; + cur_rule[3] |= R_POSITION; + } +/* fprintf(stderr, "new section %s -- cur_num_weights = %d\n", p->name, cur_num_weights); */ + + return p; +} + + + +static void do_order_start(void); +static void do_order_end(void); +static void do_reorder_after(void); +static void do_reorder_end(void); +static void do_reorder_sections_after(void); +static void do_reorder_sections_end(void); +static void do_copy(void); +static void do_colsym(void); +static void do_colele(void); +static void do_script(void); +static void do_range(void); + +static col_locale_t *new_col_locale(char *name); +static int colitem_cmp(const void *n1, const void *n2); +static int colelement_cmp(const void *n1, const void *n2); +static void del_colitem(colitem_t *p); +static colitem_t *new_colitem(char *item, char *def); +static void add_colitem(char *item, char *def); +static void add_script(const char *s); +static unsigned int add_rule(weighted_item_t *wi); +static unsigned int add_range_rule(range_item_t *ri); + +static const keyword_table_t keyword_table[] = { + { "collating-symbol", do_colsym }, + { "collating-element", do_colele }, + { "script", do_script }, + { "copy", do_copy }, + { "order_start", do_order_start }, + { "order_end", do_order_end }, + { "order-end", do_order_end }, + { "reorder-after", do_reorder_after }, + { "reorder-end", do_reorder_end }, + { "reorder-sections-after", do_reorder_sections_after }, + { "reorder-sections-end", do_reorder_sections_end }, + { "UCLIBC_RANGE", do_range }, + { NULL, do_unrecognized } +}; + + +static void do_unrecognized(void) +{ +#if 1 + error_msg("warning: unrecognized: %s", pos); +#else +/* fprintf(stderr, "warning: 
unrecognized initial keyword \"%s\"\n", pos); */ + fprintf(stderr, "warning: unrecognized: %s", pos); + if (end_of_token) { + fprintf(stderr, "%c%s", end_of_token, pos_e+1); + } + fprintf(stderr, "\n"); +#endif +} + +/* typedef struct { */ +/* const char *symbol1; */ +/* const char *symbol2; */ +/* int length; */ +/* weight_t *weight; */ +/* } range_item_t; */ + +static void do_range(void) +{ + range_item_t *ri; + weight_t w; + int i; + char *s; + char *s1; + char *s2; + const char **ci; + ll_item_t *lli; + + assert(!superset); + assert(order_state == IN_ORDER); + + s1 = next_token(); + if (!s1) { + error_msg("missing start of range"); + } + if (!is_ucode(s1)) { + error_msg("start of range is not a ucode: %s", s1); + } + s1 = xsymdup(s1); + + s2 = next_token(); + if (!s2) { + error_msg("missing end of range"); + } + if (!is_ucode(s2)) { + error_msg("end of range is not a ucode: %s", s2); + } + s2 = xsymdup(s2); + + ri = (range_item_t *) xmalloc(sizeof(range_item_t)); + ri->symbol1 = s1; + ri->symbol2 = s2; + ri->length = strtoul(s2+2, NULL, 16) - strtoul(s1+2, NULL, 16); + if (ri->length <= 0) { + error_msg("illegal range length %d", ri->length); + } + + s = next_token(); + w.num_weights = cur_num_weights; + + for (i=0 ; i < cur_num_weights ; i++) { + w.rule[i] = cur_rule[i]; + } + ci = w.colitem + (i-1); + /* now i == cur_num_weights */ + +#define STR_DITTO "." 
+ + while (s && *s && i) { + --i; + if (*s == ';') { + ci[-i] = xsymdup(STR_DITTO); + if (*++s) { + continue; + } + } + if (*s) { + ci[-i] = xsymdup(s); + } + s = next_token(); + if (s) { + if (*s == ';') { + ++s; + } else if (i) { + error_msg("missing seperator"); + } + } + } + if (s) { + error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s); + } + + while (i) { /* missing weights are not an error */ + --i; + ci[-i] = xsymdup(STR_DITTO); + } + + ri->weight = register_weight(&w); + +/* if ((i = is_ucode(t)) != 0) { */ +/* assert(!t[i]); */ +/* add_colitem(t, NULL); */ +/* } */ + + lli = new_ll_item(DT_RANGE, ri); + if (!cur_section->itm_list) { +/* printf("creating new item list: %s\n", wi->symbol); */ + cur_section->itm_list = lli; + lli->prev = lli->next = lli; + ++cur_section->num_items; + } else { + insque(lli, cur_section->itm_list->prev); +/* printf("adding item to list: %d - %s\n", ll_len(cur_section->itm_list), wi->symbol); */ + ++cur_section->num_items; + } +/* add_wi_index(lli); */ + + +} + +static weighted_item_t *add_weight(char *t) +{ + weighted_item_t *wi; + weight_t w; + int i; + char *s; + const char **ci; + + t = xsymdup(t); + + s = next_token(); + w.num_weights = cur_num_weights; + + for (i=0 ; i < cur_num_weights ; i++) { + w.rule[i] = cur_rule[i]; + } + ci = w.colitem + (i-1); + /* now i == cur_num_weights */ + + while (s && *s && i) { + --i; + if (*s == ';') { + ci[-i] = xsymdup(STR_DITTO); + if (*++s) { + continue; + } + } + if (*s) { + if (!strcmp(s,t)) { + s = STR_DITTO; + } + ci[-i] = xsymdup(s); + } + s = next_token(); + if (s) { + if (*s == ';') { + ++s; + } else if (i) { + error_msg("missing seperator"); + } + } + } + if (s) { + error_msg("too many weights: %d %d |%s| %d", cur_num_weights, i, s, (int)*s); + } + + while (i) { /* missing weights are not an error */ + --i; + ci[-i] = xsymdup(STR_DITTO); + } + + wi = xmalloc(sizeof(weighted_item_t)); + wi->symbol = t; + wi->weight = register_weight(&w); + + if ((i = 
is_ucode(t)) != 0) { + assert(!t[i]); + add_colitem(t, NULL); + } + + return wi; +} + +static void add_superset_weight(char *t) +{ + ll_item_t *lli; + weighted_item_t *wi; + + if (!comm_cur_ptr + || (strcmp(t, ((weighted_item_t *)(comm_cur_ptr->data))->symbol) != 0) + ) { /* now out of sync */ + if (superset_in_sync) { /* need a new section */ + superset_in_sync = 0; + + cur_section = new_section("R"); + cur_num_weights = cur_section->num_rules + = ((section_t *)(cur_base->section_list->data))->num_rules; + memcpy(cur_rule, + ((section_t *)(cur_base->section_list->data))->rules, + MAX_COLLATION_WEIGHTS); + memcpy(cur_section->rules, + ((section_t *)(cur_base->section_list->data))->rules, + MAX_COLLATION_WEIGHTS); + + insque(new_ll_item(DT_REORDER, cur_section), find_ll_last(cur_col->section_list)); + assert(comm_prev_ptr); + lli = new_ll_item(DT_REORDER, cur_section); + lli->prev = lli->next = lli; + insque(lli, comm_prev_ptr); +/* fprintf(stderr, " subsection -----------------------\n"); */ + } + +/* fprintf(stderr, " %s %s\n", t, ((weighted_item_t *)(comm_cur_ptr->data))->symbol); */ + wi = add_weight(t); + lli = new_ll_item(DT_WEIGHTED, wi); + mark_reordered(wi->symbol); + /* printf("reorder: %s\n", t); */ + if (!cur_section->itm_list) { + cur_section->itm_list = lli; + lli->prev = lli->next = lli; + ++cur_section->num_items; + } else { + insque(lli, cur_section->itm_list->prev); + ++cur_section->num_items; + } + add_wi_index(lli); + + } else { /* in sync */ + superset_in_sync = 1; + next_comm_ptr(); + } +} + +static void do_weight(char *t) +{ + weighted_item_t *wi; + ll_item_t *lli; + + if (superset) { + add_superset_weight(t); + return; + } + + switch(order_state) { + case 0: +/* printf("no-order weight: %s\n", t); */ +/* break; */ + case IN_ORDER: + /* in a section */ +/* printf("weight: %s\n", t); */ + wi = add_weight(t); + lli = new_ll_item(DT_WEIGHTED, wi); + if (!cur_section->itm_list) { +/* printf("creating new item list: %s\n", wi->symbol); */ + 
cur_section->itm_list = lli; + lli->prev = lli->next = lli; + ++cur_section->num_items; + } else { + insque(lli, cur_section->itm_list->prev); +/* printf("adding item to list: %d - %s\n", ll_len(cur_section->itm_list), wi->symbol); */ + ++cur_section->num_items; + } + add_wi_index(lli); + break; + case IN_REORDER: + /* std rule - but in a block with an insert-after pt */ + wi = add_weight(t); + lli = new_ll_item(DT_WEIGHTED, wi); + mark_reordered(wi->symbol); +/* printf("reorder: %s\n", t); */ + if (!cur_section->itm_list) { + cur_section->itm_list = lli; + lli->prev = lli->next = lli; + ++cur_section->num_items; + } else { + insque(lli, cur_section->itm_list->prev); + ++cur_section->num_items; + } + add_wi_index(lli); + break; + case IN_REORDER_SECTIONS: + t = xsymdup(t); + if (next_token() != NULL) { + error_msg("trailing text in reorder section item: %s", pos); + } + lli = cur_col->section_list; + do { + if (lli->data_type & DT_SECTION) { + if (!strcmp(((section_t *)(lli->data))->name, t)) { + lli->data_type = DT_REORDER; + lli = new_ll_item(DT_REORDER, (section_t *)(lli->data)); + insque(lli, reorder_section_ptr); + reorder_section_ptr = lli; + return; + } + } + lli = lli->next; + } while (lli); + error_msg("reorder_sections_after for non-base item currently not supported: %s", t); +/* fprintf(stderr, "reorder_secitons: %s\n", t); */ + break; + default: + error_msg("invalid order_state %d", order_state); + } +} + +static int col_locale_cmp(const void *n1, const void *n2) +{ + return strcmp(((const col_locale_t *) n1)->name, ((const col_locale_t *) n2)->name); +} + +static void processfile(void) +{ + char *t; + const keyword_table_t *k; + + order_state = 0; +#warning devel code +/* cur_num_weights = 0; */ +/* cur_num_weights = 4; */ +/* memset(cur_rule, R_FORWARD, 4); */ + + if (cur_col != cur_base) { + cur_col->base_locale = cur_base; + cur_col->undefined_idx = cur_base->undefined_idx; + if (!cur_base->derived_list) { + cur_base->derived_list = 
new_ll_item(DT_COL_LOCALE, cur_col); + } else { + insque(new_ll_item(DT_COL_LOCALE, cur_col), find_ll_last(cur_base->derived_list)); + } + } + + if (tfind(cur_col, &root_col_locale, col_locale_cmp)) { + error_msg("attempt to readd locale: %s", cur_col->name); + } + if (!tsearch(cur_col, &root_col_locale, col_locale_cmp)) { + error_msg("OUT OF MEMORY!"); + } + + if (superset) { + superset_order_start_cnt = 0; + superset_in_sync = 0; + init_comm_ptr(); + } + + while (next_line()) { +/* printf("%5d:", lineno[fno]); */ +/* while ((t = next_token()) != NULL) { */ +/* printf(" |%s|", t); */ +/* printf("\n"); */ +/* } */ + t = next_token(); + assert(t); + assert(t == pos); + if ((*t == '<') || (!strcmp(t, "UNDEFINED"))) { + do_weight(t); + } else { + for (k = keyword_table ; k->keyword ; k++) { + if (!strcmp(k->keyword, t)) { + break; + } + } + k->handler(); + } + } + + if (cur_base == cur_col) { + fprintf(stderr, "Base: %15s", cur_col->name); + } else { +#if 1 + if (!cur_col->undefined_idx) { +#if 0 + if (superset) { + if (superset_order_start_cnt == 1) { + --superset_order_start_cnt; /* ugh.. hack this */ + } + } +#endif + /* This is an awful hack to get around the problem of unspecified UNDEFINED + * definitions in the supported locales derived from iso14651_t1. 
*/ + if (!strcmp(cur_base->name, "iso14651_t1")) { + fprintf(stderr, "Warning: adding UNDEFINED entry for %s\n", cur_col->name); + strcpy(linebuf, "order_start forward;backward;forward;forward,position\n"); + pos_e = NULL; + pos = linebuf; + t = next_token(); + assert(t); + assert(t == pos); + do_order_start(); + strcpy(linebuf, "UNDEFINED IGNORE;IGNORE;IGNORE\n"); + pos_e = NULL; + pos = linebuf; + t = next_token(); + assert(t); + assert(t == pos); + do_weight(t); + } else { + error_msg("no definition of UNDEFINED for %s", cur_col->name); + } + } +#endif + + fprintf(stderr, " Der: %15s", cur_col->name); + } + { + ll_item_t *p = cur_col->section_list; + + fprintf(stderr, "%6u weights", tnumnodes(cur_col->root_wi_index)); + if (cur_base) { + fprintf(stderr, " %6u der %6u reor %6u starter - %u new stubs", + tnumnodes(cur_base->root_derived_wi), + tnumnodes(cur_base->root_wi_index_reordered), + tnumnodes(cur_base->root_starter_char), + ll_count(cur_col->section_list, DT_REORDER)); + } + fprintf(stderr, "\n"); + +#if 0 + while (p) { + assert(((section_t *)(p->data))->num_items == + ll_len(((section_t *)(p->data))->itm_list)); + + + if (!p->next && + ((*((section_t *)(p->data))->name == 'a') + && (((section_t *)(p->data))->num_items == 0)) + ) { + break; + } + + if (!(p->data_type & DT_REORDER)) { + if ((*((section_t *)(p->data))->name != 'a') + || (((section_t *)(p->data))->num_items > 0) + ) { + fprintf(stderr, +/* "\t%-15s %zu\n", */ + "\t%-15s %6u\n", + ((section_t *)(p->data))->name, + ((section_t *)(p->data))->num_items); + } + } + p = p->next; + } +#endif + } + + +} + +static void print_colnode(const void *ptr, VISIT order, int level) +{ + const colitem_t *p = *(const colitem_t **) ptr; + + if (order == postorder || order == leaf) { + printf("collating item = \"%s\"", p->string); + if (p->element) { + printf(" is %s", p->element); + } + printf("\n"); + } +} + +static void print_weight_node(const void *ptr, VISIT order, int level) +{ + const weight_t *p = *(const 
weight_t **) ptr; + int i; + + if (order == postorder || order == leaf) { + printf("weight: (%d) ", p->num_weights); + for (i = 0 ; i < p->num_weights ; i++) { + if (p->rule[i] & R_FORWARD) { + printf("F"); + } + if (p->rule[i] & R_BACKWARD) { + printf("B"); + } + if (p->rule[i] & R_POSITION) { + printf("P"); + } + printf(","); + } + for (i = 0 ; i < p->num_weights ; i++) { + printf(" %s", p->colitem[i]); + } + printf("\n"); + } +} + + +typedef struct { + const char *der_name; + int base_locale; +} deps_t; + +enum { + BASE_iso14651_t1, + BASE_comm, + BASE_cs_CZ, + BASE_ar_SA, + BASE_th_TH, + BASE_ja_JP, + BASE_ko_KR, + BASE_MAX +}; + +static const char *base_name[] = { + "iso14651_t1", + "comm", + "cs_CZ", + "ar_SA", + "th_TH", + "ja_JP", + "ko_KR" +}; + + + +static ll_item_t *locale_list[BASE_MAX]; + +static void init_locale_list(void) +{ + int i; + + for (i=0 ; i < BASE_MAX ; i++) { + locale_list[i] = (ll_item_t *) xmalloc(sizeof(ll_item_t)); + locale_list[i]->prev = locale_list[i]->next = locale_list[i]; + locale_list[i]->data = (void *) base_name[i]; + } +} + + +deps_t deps[] = { + { "af_ZA", BASE_iso14651_t1 }, + { "am_ET", BASE_iso14651_t1 }, + { "ar_AE", BASE_iso14651_t1 }, + { "ar_BH", BASE_iso14651_t1 }, + { "ar_DZ", BASE_iso14651_t1 }, + { "ar_EG", BASE_iso14651_t1 }, + { "ar_IN", BASE_iso14651_t1 }, + { "ar_IQ", BASE_iso14651_t1 }, + { "ar_JO", BASE_iso14651_t1 }, + { "ar_KW", BASE_iso14651_t1 }, + { "ar_LB", BASE_iso14651_t1 }, + { "ar_LY", BASE_iso14651_t1 }, + { "ar_MA", BASE_iso14651_t1 }, + { "ar_OM", BASE_iso14651_t1 }, + { "ar_QA", BASE_iso14651_t1 }, + { "ar_SA", BASE_ar_SA }, + { "ar_SD", BASE_iso14651_t1 }, + { "ar_SY", BASE_iso14651_t1 }, + { "ar_TN", BASE_iso14651_t1 }, + { "ar_YE", BASE_iso14651_t1 }, + { "az_AZ", BASE_iso14651_t1 }, + { "be_BY", BASE_iso14651_t1 }, + { "bg_BG", BASE_iso14651_t1 }, + { "bn_BD", BASE_iso14651_t1 }, + { "bn_IN", BASE_iso14651_t1 }, + { "br_FR", BASE_iso14651_t1 }, + { "bs_BA", BASE_iso14651_t1 }, + { "ca_ES", 
BASE_comm }, + { "cs_CZ", BASE_cs_CZ }, + { "cy_GB", BASE_iso14651_t1 }, + { "da_DK", BASE_comm }, + { "de_AT", BASE_iso14651_t1 }, + { "de_BE", BASE_iso14651_t1 }, + { "de_CH", BASE_iso14651_t1 }, + { "de_DE", BASE_iso14651_t1 }, + { "de_LU", BASE_iso14651_t1 }, + { "el_GR", BASE_iso14651_t1 }, + { "en_AU", BASE_iso14651_t1 }, + { "en_BW", BASE_iso14651_t1 }, + { "en_CA", BASE_comm }, + { "en_DK", BASE_iso14651_t1 }, + { "en_GB", BASE_iso14651_t1 }, + { "en_HK", BASE_iso14651_t1 }, + { "en_IE", BASE_iso14651_t1 }, + { "en_IN", BASE_iso14651_t1 }, + { "en_NZ", BASE_iso14651_t1 }, + { "en_PH", BASE_iso14651_t1 }, + { "en_SG", BASE_iso14651_t1 }, + { "en_US", BASE_iso14651_t1 }, + { "en_ZA", BASE_iso14651_t1 }, + { "en_ZW", BASE_iso14651_t1 }, + { "eo_EO", BASE_iso14651_t1 }, + { "es_AR", BASE_comm }, + { "es_BO", BASE_comm }, + { "es_CL", BASE_comm }, + { "es_CO", BASE_comm }, + { "es_CR", BASE_comm }, + { "es_DO", BASE_comm }, + { "es_EC", BASE_comm }, + { "es_ES", BASE_comm }, + { "es_GT", BASE_comm }, + { "es_HN", BASE_comm }, + { "es_MX", BASE_comm }, + { "es_NI", BASE_comm }, + { "es_PA", BASE_comm }, + { "es_PE", BASE_comm }, + { "es_PR", BASE_comm }, + { "es_PY", BASE_comm }, + { "es_SV", BASE_comm }, + { "es_US", BASE_comm }, + { "es_UY", BASE_comm }, + { "es_VE", BASE_comm }, + { "et_EE", BASE_comm }, + { "eu_ES", BASE_iso14651_t1 }, + { "fa_IR", BASE_iso14651_t1 }, + { "fi_FI", BASE_comm }, + { "fo_FO", BASE_comm }, + { "fr_BE", BASE_iso14651_t1 }, + { "fr_CA", BASE_comm }, + { "fr_CH", BASE_iso14651_t1 }, + { "fr_FR", BASE_iso14651_t1 }, + { "fr_LU", BASE_iso14651_t1 }, + { "ga_IE", BASE_iso14651_t1 }, + { "gd_GB", BASE_iso14651_t1 }, + { "gl_ES", BASE_comm }, + { "gv_GB", BASE_iso14651_t1 }, + { "he_IL", BASE_iso14651_t1 }, + { "hi_IN", BASE_iso14651_t1 }, + { "hr_HR", BASE_comm }, + { "hu_HU", BASE_iso14651_t1 }, + { "hy_AM", BASE_iso14651_t1 }, + { "id_ID", BASE_iso14651_t1 }, + { "is_IS", BASE_comm }, + { "it_CH", BASE_iso14651_t1 }, + { "it_IT", 
BASE_iso14651_t1 }, + { "iw_IL", BASE_iso14651_t1 }, + { "ja_JP", BASE_ja_JP }, + { "ka_GE", BASE_iso14651_t1 }, + { "kl_GL", BASE_comm }, + { "ko_KR", BASE_ko_KR }, + { "kw_GB", BASE_iso14651_t1 }, + { "lt_LT", BASE_comm }, + { "lv_LV", BASE_comm }, + { "mi_NZ", BASE_iso14651_t1 }, + { "mk_MK", BASE_iso14651_t1 }, + { "mr_IN", BASE_iso14651_t1 }, + { "ms_MY", BASE_iso14651_t1 }, + { "mt_MT", BASE_iso14651_t1 }, + { "nl_BE", BASE_iso14651_t1 }, + { "nl_NL", BASE_iso14651_t1 }, + { "nn_NO", BASE_iso14651_t1 }, + { "no_NO", BASE_comm }, + { "oc_FR", BASE_iso14651_t1 }, + { "pl_PL", BASE_comm }, + { "pt_BR", BASE_iso14651_t1 }, + { "pt_PT", BASE_iso14651_t1 }, + { "ro_RO", BASE_iso14651_t1 }, + { "ru_RU", BASE_iso14651_t1 }, + { "ru_UA", BASE_iso14651_t1 }, + { "se_NO", BASE_iso14651_t1 }, + { "sk_SK", BASE_cs_CZ }, + { "sl_SI", BASE_comm }, + { "sq_AL", BASE_iso14651_t1 }, + { "sr_YU", BASE_iso14651_t1 }, + { "sv_FI", BASE_comm }, + { "sv_SE", BASE_iso14651_t1 }, + { "ta_IN", BASE_iso14651_t1 }, + { "te_IN", BASE_iso14651_t1 }, + { "tg_TJ", BASE_iso14651_t1 }, + { "th_TH", BASE_th_TH }, + { "ti_ER", BASE_iso14651_t1 }, + { "ti_ET", BASE_iso14651_t1 }, + { "tl_PH", BASE_iso14651_t1 }, + { "tr_TR", BASE_comm }, + { "tt_RU", BASE_iso14651_t1 }, + { "uk_UA", BASE_iso14651_t1 }, + { "ur_PK", BASE_iso14651_t1 }, + { "uz_UZ", BASE_iso14651_t1 }, + { "vi_VN", BASE_iso14651_t1 }, + { "wa_BE", BASE_iso14651_t1 }, + { "yi_US", BASE_iso14651_t1 }, + { "zh_CN", BASE_iso14651_t1 }, + { "zh_HK", BASE_iso14651_t1 }, + { "zh_SG", BASE_iso14651_t1 }, + { "zh_TW", BASE_iso14651_t1 }, +}; + + +static int der_count[BASE_MAX]; +static const char *new_args[500]; +static int new_arg_count; + +static int dep_cmp(const void *s1, const void *s2) +{ + return strcmp( (const char *) s1, ((const deps_t *) s2)->der_name); +} + +static int old_main(int argc, char **argv); + +int main(int argc, char **argv) +{ + const deps_t *p; + ll_item_t *lli; + int i; + int total; + + if (argc < 2) { + return 
EXIT_FAILURE; + } + + init_locale_list(); + + while (--argc) { + p = (const deps_t *) bsearch(*++argv, deps, sizeof(deps)/sizeof(deps[0]), sizeof(deps[0]), dep_cmp); + if (!p) { + if (!strcmp("C", *argv)) { + printf("ignoring C locale\n"); + continue; + } else { + printf("%s not found\n", *argv); + return EXIT_FAILURE; + } + } + + i = p->base_locale; + ++der_count[i]; + + if (!strcmp(base_name[i], *argv)) { + /* same name as base, so skip after count incremented */ + continue; + } + + /* add it to the list. the main body will catch duplicates */ + lli = (ll_item_t *) xmalloc(sizeof(ll_item_t)); + lli->prev = lli->next = NULL; + lli->data = (void *) *argv; + insque(lli, locale_list[i]); + } + + total = 0; + for (i=0 ; i < BASE_MAX ; i++) { +/* printf("der_count[%2d] = %3d\n", i, der_count[i]); */ + total += der_count[i]; + } +/* printf("total = %d\n", total); */ + + new_args[new_arg_count++] = "dummyprogramname"; + for (i=0 ; i < BASE_MAX ; i++) { + if (!der_count[i]) { + continue; + } + new_args[new_arg_count++] = (i == BASE_comm) ? "-c" : "-b"; + lli = locale_list[i]; + do { + new_args[new_arg_count++] = (const char *) (lli->data); + lli = lli->next; + } while (lli != locale_list[i]); + new_args[new_arg_count++] = "-f"; + } + +/* for (i=0 ; i < new_arg_count ; i++) { */ +/* printf("%3d: %s\n", i, new_args[i]); */ +/* } */ + + return old_main(new_arg_count, (char **) new_args); +} + + +/* usage... 
prog -b basefile derived {derived} -s single {single} */ + +/* Walk the argument list handed over by main(). + * -b, -c and -s select how the file names that follow are treated, -f calls + * finalize_base(), -dNAME and -R dump weights. Every other argument is read + * as a collation file. Afterwards the table sizes are reported on stderr and + * the tables are written to locale_collate.h. */ +static int old_main(int argc, char **argv) +{ + int next_is_base = 0; + int next_is_subset = 0; + + superset = 0; + + while (--argc) { + ++argv; + if (**argv == '-') { + if ((*argv)[1] == 'd') { + dump_weights((*argv) + 2); + } else if ((*argv)[1] == 'f') { /* finalize the current base */ + finalize_base(); + } else if ((*argv)[1] == 'R') { /* dump all weight rules */ + twalk(root_weight, print_weight_node); + } else if (((*argv)[1] == 'c') && !(*argv)[2]) { /* new common subset */ + cur_base = cur_derived = NULL; + next_is_subset = 1; + next_is_base = 1; + superset = 0; + } else if (((*argv)[1] == 'b') && !(*argv)[2]) { /* new base locale */ + cur_base = cur_derived = NULL; + next_is_subset = 0; + next_is_base = 1; + superset = 0; + } else if (((*argv)[1] == 's') && !(*argv)[2]) { /* single locales follow */ + cur_base = cur_derived = NULL; + next_is_subset = 0; + next_is_base = 2; + superset = 0; + } else { + error_msg("unrecognized option %s", *argv); + } + continue; + } + /* new file */ + new_col_locale(*argv); /* automatically sets cur_col */ + if (next_is_base) { + cur_base = cur_col; + } else { + cur_derived = cur_col; + } + pushfile(*argv); +/* fprintf(stderr, "processing file %s\n", *argv); */ + processfile(); /* this does a popfile */ + +/* twalk(cur_col->root_colitem, print_colnode); */ + + if (next_is_base == 1) { + next_is_base = 0; + } + if (next_is_subset) { + next_is_subset = 0; + superset = 1; + } + } + + /* every counter printed below is a size_t, hence %zu */ + fprintf(stderr, "success!\n"); + fprintf(stderr, + "num_sym=%zu mem_sym=%zu unique_weights=%zu\n", + num_sym, mem_sym, unique_weights); +/* twalk(root_weight, print_weight_node); */ + + fprintf(stderr, "num base locales = %zu num derived locales = %zu\n", + base_locale_len, der_locale_len); + + fprintf(stderr, + "override_len = %zu multistart_len = %zu weightstr_len = %zu\n" + "wcs2colidt_len = %zu index2weight_len = %zu index2ruleidx_len = %zu\n" + 
"ruletable_len = %zu\n" + "total size is %zu bytes or %zu kB\n", + override_len, multistart_len, weightstr_len, + wcs2colidt_len, index2weight_len, index2ruleidx_len, + ruletable_len, +#warning mult by 2 for rule indecies + (override_len + multistart_len + weightstr_len + + wcs2colidt_len + index2weight_len + index2ruleidx_len + ruletable_len) * 2, + (override_len + multistart_len + weightstr_len + + wcs2colidt_len + index2weight_len + index2ruleidx_len + ruletable_len + 511) / 512); + +#if 0 + { + int i; + + for (i=0 ; i < base_locale_len ; i++) { + dump_base_locale(i); + } + for (i=0 ; i < der_locale_len ; i++) { + dump_der_locale(i); + } + } +#endif + + { + FILE *fp = fopen("locale_collate.h", "w"); + + if (!fp) { + error_msg("couldn't open output file!"); + } + dump_collate(fp); + if (ferror(fp) || fclose(fp)) { + error_msg("write error or close error for output file!\n"); + } + } + + return EXIT_SUCCESS; +} + +/* Print "Error: ", the current file name and line number when a file is open, + * and the formatted message to stderr, then exit with EXIT_FAILURE. */ +static void error_msg(const char *fmt, ...) +{ + va_list arg; + + fprintf(stderr, "Error: "); + if (fno >= 0) { + fprintf(stderr, "file %s (%d): ", fname[fno], lineno[fno]); + } + va_start(arg, fmt); + vfprintf(stderr, fmt, arg); + va_end(arg); + fprintf(stderr, "\n"); + + exit(EXIT_FAILURE); +} + +/* Open collation/<filename> for reading and push it on the file stack. + * Errors out when the stack is full or the file cannot be opened. */ +static void pushfile(char *filename) +{ + static char fbuf[PATH_MAX]; /* was declared without a type (implicit int) */ + + snprintf(fbuf, sizeof(fbuf), "collation/%s", filename); + + if (fno >= MAX_FNO) { + error_msg("file stack size exceeded"); + } + + if (!(fstack[++fno] = fopen(fbuf, "r"))) { + --fno; /* oops */ + error_msg("cannot open file %s", fbuf); + } + + fname[fno] = xsymdup(filename); + lineno[fno] = 0; +} + +/* Close the file on top of the file stack and drop it. */ +static void popfile(void) +{ + if (fno < 0) { + error_msg("pop on empty file stack"); + } + +/* free(fname[fno]); */ + fclose(fstack[fno]); + --fno; +} + +/* Advance pos past any whitespace. */ +static void eatwhitespace(void) +{ + /* <ctype.h> functions need an unsigned char value; plain char may be negative */ + while (isspace((unsigned char) *pos)) { + ++pos; + } +} + +/* Nonzero when c is one of the two comment characters, '#' or '%'. */ +static int iscommentchar(int c) +{ + return ((c == '#') || (c == '%')); +} + +static int next_line(void) +{ + size_t n; + char *s = linebuf; + + assert(fno >= 0); + + pos_e 
= NULL; + do { + if (fgets(s, sizeof(linebuf), fstack[fno]) != NULL) { + ++lineno[fno]; + n = strlen(linebuf); + if ((n == sizeof(linebuf) - 1) && (linebuf[n-1] != '\n')) { + /* Either line is too long or last line is very long with + * no trailing newline. But we'll always treat it as an + * errro. */ + error_msg("line too long?"); + } + + --n; + /* Be careful... last line doesn't need a newline. */ + if (linebuf[n] == '\n') { + linebuf[n--] = 0; /* trim trailing newline */ + } + + pos = linebuf; + eatwhitespace(); + if (*pos && !iscommentchar(*pos)) { /* not empty or comment line */ + return 1; /* got a line */ + } + } else { /* eof */ + popfile(); + } + } while (fno >= 0); + + return 0; +} + +static char *next_token(void) +{ + char *p; + +#if 0 + if (pos_e == NULL) { + return NULL + pos = pos_e; + *pos = end_of_token; + end_of_token = 0; + } +#else + if (pos_e != NULL) { + pos = pos_e; + *pos = end_of_token; + end_of_token = 0; + } +#endif + eatwhitespace(); + p = pos; + + if (!*p || iscommentchar(*p)) { /* end of line or start of comment */ + pos = pos_e = NULL; + *p = 0; /* treat comment as end of line */ +/* fprintf(stdout, "returning NUL token |%s|\n", pos); */ + return NULL; +#if 1 + } else if (*p == '<') { /* collating symbol, element, or value */ + while (*++p) { + if ((*p == '/') && p[1]) { + ++p; + continue; + } + if (*p == '>') { + pos_e = ++p; + end_of_token = *p; + *p = 0; +/* fprintf(stdout, "returning col token |%s|\n", pos); */ + return pos; + } + } + } else if (*p == '"') { /* collating element value? 
*/ + while (*++p) { + if (*p == '"') { /* found the end of the quoted string */ + pos_e = ++p; + end_of_token = *p; + *p = 0; +/* fprintf(stdout, "returning quote token |%s|\n", pos); */ + return pos; + } + } +#endif + } else { /* some kind of keyword */ + while (*++p) { + if (isspace(*p) || (*p == ';')) { + break; + } + } + pos_e = p; + end_of_token = *p; + *p = 0; +/* fprintf(stdout, "returning key token |%s|\n", pos); */ + return pos; + } + + error_msg("illegal token |%s|", pos); +} + +static void *xmalloc(size_t n) +{ + void *p; + + if (!(p = malloc(n))) { + error_msg("OUT OF MEMORY"); + } + return p; +} + +static void do_copy(void) +{ + char *s; + char *e; + + if ((s = next_token()) != NULL) { + e = strchr(s + 1, '"'); + if ((*s == '"') && e && (*e == '"') && !e[1]) { + if (next_token() != NULL) { + error_msg("illegal trailing text: %s", pos); + } + *e = 0; + ++s; + if (cur_base && !strcmp(cur_base->name,s)) { +/* fprintf(stderr, "skipping copy of base file %s\n", s); */ +#warning need to update last in order and position or check + return; + } +/* fprintf(stderr, "full copy of %s\n", s); */ + pushfile(s); + return; + } + } + error_msg("illegal or missing arg for copy: %s", s); +} + +static void do_colsym(void) +{ + char *s; + char *e; + + if ((s = next_token()) != NULL) { + e = strrchr(s,'>'); + if ((*s == '<') && e && (*e == '>') && !e[1]) { + if (next_token() != NULL) { + error_msg("illegal trailing text: %s", pos); + } + e[1] = 0; /* cleanup in case next_token stored something */ + add_colitem(s,NULL); + return; + } + } + error_msg("illegal or missing arg for collating-symbol: %s", s); +} + +static void do_colele(void) +{ + char *s; + char *e; + char *s1; + char *e1; + int n; + + if ((s = next_token()) != NULL) { + e = strrchr(s,'>'); + if ((*s == '<') && e && (*e == '>') && !e[1]) { + if (((s1 = next_token()) == NULL) + || (strcmp(s1,"from") != 0) + || ((s1 = next_token()) == NULL) + || (*s1 != '\"') + ) { + error_msg("illegal format for collating-element 
spec"); + } + e1 = strchr(s1 + 1, '"'); + if ((*s1 != '"') || !e1 || (*e1 != '"') || (e1[1] != 0)) { + error_msg("illegal definition for collating-element: %s", s1); + } + if (next_token() != NULL) { + error_msg("illegal trailing text: %s", pos); + } + e[1] = 0; /* cleanup in case next_token stored something */ + e1[1] = 0; + add_colitem(s,s1); + ++s1; + if (!(n = is_ucode(s1))) { + error_msg("starting char must be a code: %s", s1); + } + assert(s1[n] == '<'); + s1[n] = 0; + s = xsymdup(s1); + if (!(tsearch(s, &cur_base->root_starter_char, sym_cmp))) { + error_msg("OUT OF MEMORY"); + } + + return; + } + } + error_msg("illegal or missing arg for collating-element: %s", s); +} + +static ll_item_t *find_section_list_item(const char *name, col_locale_t *loc) +{ + ll_item_t *p; + + if (!loc) { + return NULL; + } + + p = loc->section_list; + + while (p) { +#warning devel code +/* if (!((p->data_type == DT_SECTION) || (p->data_type == DT_REORDER))) { */ +/* fprintf(stderr, "fsli = %d\n", p->data_type); */ +/* } */ + assert((p->data_type == DT_SECTION) || (p->data_type == DT_REORDER)); + if (!strcmp(name, ((section_t *)(p->data))->name)) { + break; + } + p = p->next; + } + return p; +} + +static ll_item_t *find_ll_last(ll_item_t *p) +{ + assert(p); + + while (p->next) { + p = p->next; + } + return p; +} + +static void do_script(void) +{ + char *s; + char *e; + + if ((s = next_token()) != NULL) { + e = strrchr(s,'>'); + if ((*s == '<') && e && (*e == '>') && !e[1]) { + if (next_token() != NULL) { + error_msg("illegal trailing text: %s", pos); + } + e[1] = 0; /* cleanup in case next_token stored something */ + add_script(s); + return; + } + } + error_msg("illegal or missing arg for script: %s", s); +} + +static col_locale_t *new_col_locale(char *name) +{ + ll_item_t *lli; + ll_item_t *lli2; + + cur_col = (col_locale_t *) xmalloc(sizeof(col_locale_t)); + cur_col->name = name; + cur_col->root_colitem = NULL; + cur_col->root_element = NULL; + cur_col->root_scripts = NULL; + 
cur_col->base_locale = NULL; + if (!superset) { + /* start with an anonymous section */ + cur_section = new_section(NULL); + cur_col->section_list = new_ll_item(DT_SECTION, cur_section); + } else { + /* start with a reorder section */ + cur_section = new_section("R"); + cur_num_weights = cur_section->num_rules + = ((section_t *)(cur_base->section_list->data))->num_rules; + memcpy(cur_rule, + ((section_t *)(cur_base->section_list->data))->rules, + MAX_COLLATION_WEIGHTS); + memcpy(cur_section->rules, + ((section_t *)(cur_base->section_list->data))->rules, + MAX_COLLATION_WEIGHTS); + cur_col->section_list = new_ll_item(DT_REORDER, cur_section); + assert(cur_base->section_list->next == NULL); /* currently only one section allowed */ + lli = ((section_t *)(cur_base->section_list->data))->itm_list; + assert(lli); + lli2 = new_ll_item(DT_REORDER, cur_section); + lli2->prev = lli2->next = lli2; + insque(lli2, lli->prev); + ((section_t *)(cur_base->section_list->data))->itm_list = lli2; + } +/* cur_col->section_list = NULL; */ +/* add_script(((section_t *)(cur_col->section_list->data))->name); */ + cur_col->root_wi_index = NULL; + cur_col->root_wi_index_reordered = NULL; + cur_col->root_derived_wi = NULL; + cur_col->derived_list = NULL; + cur_col->root_starter_char = NULL; + cur_col->root_starter_all = NULL; + cur_col->undefined_idx = NULL; + return cur_col; +} + +static int colitem_cmp(const void *n1, const void *n2) +{ + return strcmp(((colitem_t *)n1)->string, ((colitem_t *)n2)->string); +} + +static int colelement_cmp(const void *n1, const void *n2) +{ + int r; + + r = strcmp(((colitem_t *)n1)->string, ((colitem_t *)n2)->string); + if (!r) { + if (((colitem_t *)n1)->element && ((colitem_t *)n2)->element) { + r = strcmp(((colitem_t *)n1)->element, ((colitem_t *)n2)->element); + } else if (((colitem_t *)n1)->element == ((colitem_t *)n2)->element) { + r = 0; /* both null */ + } else { + r = (((colitem_t *)n1)->element == NULL) ? 
-1 : 1; + } + } + return r; +} + +static void del_colitem(colitem_t *p) +{ +/* free((void *) p->element); */ +/* free((void *) p->string); */ + free(p); +} + +static colitem_t *new_colitem(char *item, char *def) +{ + colitem_t *p; + + p = xmalloc(sizeof(colitem_t)); + p->string = xsymdup(item); + p->element = (!def) ? def : xsymdup(def); + + return p; +} + +static void add_colitem(char *item, char *def) +{ + colitem_t *p; + +#if 0 + printf("adding collation item %s", item); + if (def) { + printf(" with definition %s", def); + } + printf("\n"); +#endif + + p = new_colitem(item, def); + +#warning devel code + if (superset) { + if (tfind(p, &cur_base->root_colitem, colitem_cmp)) { +/* fprintf(stderr, "skipping superset duplicate collating item \"%s\"\n", p->string); */ + del_colitem(p); + return; +/* } else { */ +/* fprintf(stderr, "superset: new collating item \"%s\" = %s\n", p->string, p->element); */ + } + } + + if (cur_col == cur_derived) { + if (!tfind(p, &cur_base->root_colitem, colitem_cmp)) { + /* not in current but could be in base */ + if (!tsearch(p, &cur_base->root_colitem, colitem_cmp)) { + error_msg("OUT OF MEMORY!"); + } + } else if (!tfind(p, &cur_base->root_colitem, colelement_cmp)) { + error_msg("collating element/symbol mismatch: item=%s def=%s", item, def); + } + } + + + if (!tfind(p, &cur_col->root_colitem, colitem_cmp)) { + /* not in current but could be in base */ + if (!tsearch(p, &cur_col->root_colitem, colitem_cmp)) { + error_msg("OUT OF MEMORY!"); + } + } else if (!tfind(p, &cur_col->root_colitem, colelement_cmp)) { + error_msg("collating element/symbol mismatch"); + } else { /* already there */ + fprintf(stderr, "duplicate collating item \"%s\"\n", p->string); + del_colitem(p); + } +} + +/* add a script (section) to the current locale */ +static void add_script(const char *s) +{ + ll_item_t *l; + + /* make sure it isn't in base if working with derived */ + if (cur_base != cur_col) { + if (find_section_list_item(s, cur_base)) { + 
error_msg("attempt to add script %s for derived when already in base", s); + } + } + + if (find_section_list_item(s, cur_col)) { + error_msg("attempt to readd script %s", s); + } + + l = find_ll_last(cur_col->section_list); + insque(new_ll_item(DT_SECTION, new_section(s)), l); +} + +static const char str_forward[] = "forward"; +static const char str_backward[] = "backward"; +static const char str_position[] = "position"; + +static void do_order_start(void) +{ + const char *s; + char *e; + ll_item_t *l; + section_t *sect; + int rule; + + if (order_state & ~IN_ORDER) { + error_msg("order_start following reorder{_sections}_after"); + } + order_state |= IN_ORDER; + + if (superset) { + if (++superset_order_start_cnt > 1) { + error_msg("currently only a common order_start is supported in superset"); + } + return; + } + + if (!(s = next_token())) { + s = str_forward; /* if no args */ + } + + if (*s == '<') { /* section (script) */ + e = strrchr(s,'>'); + if ((*s == '<') && e && (*e == '>') && !e[1]) { + e[1] = 0; /* cleanup in case next_token stored something */ + + if (!(l = find_section_list_item(s, cur_col))) { + error_msg("ref of undefined sections: %s", s); + } + sect = (section_t *)(l->data); + if (sect->num_rules) { + error_msg("sections already defined: %s", s); + } + } else { + error_msg("illegal section ref: %s", s); + } + + if (!(s = next_token())) { + s = str_forward; /* if no args */ + } else if (*s != ';') { + error_msg("missing seperator!"); + } + } else { /* need an anonymous section */ + if ((*cur_section->name != '<') && (cur_section->num_items == 0)) { /* already in an empty anonymous section */ + sect = cur_section; + } else { + sect = new_section(NULL); + l = find_ll_last(cur_col->section_list); + insque(new_ll_item(DT_SECTION, new_section(s)), l); + } + sect->num_rules = 0; /* setting this below so nix default */ + } + cur_section = sect; + +#warning need to add section to weight list? 
+ + /* now do rules */ + do { + rule = 0; + if (*s == ';') { + ++s; + } + while (*s) { + if (!strncmp(str_forward, s, 7)) { + rule |= R_FORWARD; + s += 7; + } else if (!strncmp(str_backward, s, 8)) { + rule |= R_BACKWARD; + s += 8; + } else if (!strncmp(str_position, s, 8)) { + rule |= R_POSITION; + s += 8; + } + + if (*s == ',') { + ++s; + continue; + } + + if (!*s || (*s == ';')) { + if (sect->num_rules >= MAX_COLLATION_WEIGHTS) { + error_msg("more than %d weight rules!", MAX_COLLATION_WEIGHTS); + } + if (!rule) { + error_msg("missing weight rule!"); + } + if ((rule & (R_FORWARD|R_BACKWARD|R_POSITION)) > R_BACKWARD) { + error_msg("backward paired with forward and/or position!"); + } + + sect->rules[sect->num_rules++] = rule; + rule = 0; + continue; + } + + error_msg("illegal weight rule: %s", s); + } + } while ((s = next_token()) != NULL); + + cur_section = sect; + +/* fprintf(stderr, "setting cur_num_weights to %d for %s\n", sect->num_rules, sect->name); */ + cur_num_weights = sect->num_rules; + memcpy(cur_rule, sect->rules, MAX_COLLATION_WEIGHTS); +} + +static void do_order_end(void) +{ + if (!(order_state & IN_ORDER)) { + error_msg("order_end with no matching order_start"); + } + order_state &= ~IN_ORDER; + + cur_section = new_section(NULL); +} + +static void do_reorder_after(void) +{ + char *t; + ll_item_t *lli; + const weight_t *w; + int save_cur_num_weights; + char save_cur_rule[MAX_COLLATION_WEIGHTS]; + + + if (order_state & ~IN_REORDER) { + error_msg("reorder_after following order_start or reorder_sections_after"); + } + order_state |= IN_REORDER; + + if (superset) { + error_msg("currently reorder_after is not supported in supersets"); + } + +#warning have to use rule for current section!!! + + if (!(t = next_token())) { + error_msg("missing arg for reorder_after"); + } + + t = xsymdup(t); + + if (next_token() != NULL) { + error_msg("trailing text reorder_after: %s", pos); + } + + if (cur_col == cur_base) { + error_msg("sorry.. 
reorder_after in base locale is not currently supported"); + } + + if (!(lli = find_wi_index(t, cur_base))) { + error_msg("reorder_after for non-base item currently not supported: %s", t); + } + + w = ((weighted_item_t *)(lli->data))->weight; + + + save_cur_num_weights = cur_num_weights; + memcpy(save_cur_rule, cur_rule, MAX_COLLATION_WEIGHTS); + + cur_section = new_section("R"); + insque(new_ll_item(DT_REORDER, cur_section), lli); + +#if 0 + + { + ll_item_t *l1; + ll_item_t *l2; + ll_item_t *l3; + l1 = new_ll_item(DT_REORDER, cur_section); + l2 = find_ll_last(cur_col->section_list); + insque(l1, l2); + l3 = find_ll_last(cur_col->section_list); + + fprintf(stderr, "reorder_after %p %p %p %s\n", l1, l2, l3, cur_section->name); + } +#else + insque(new_ll_item(DT_REORDER, cur_section), find_ll_last(cur_col->section_list)); +#endif + + cur_num_weights = cur_section->num_rules = save_cur_num_weights; + memcpy(cur_rule, save_cur_rule, MAX_COLLATION_WEIGHTS); + memcpy(cur_section->rules, save_cur_rule, MAX_COLLATION_WEIGHTS); + + +#warning devel code +/* fprintf(stderr, "reorder -- %s %d\n", ((weighted_item_t *)(lli->data))->symbol, w->num_weights); */ + +#warning hack to get around hu_HU reorder-after problem +/* if (!w->num_weights) { */ + +/* } else { */ +/* cur_num_weights = w->num_weights; */ +/* memcpy(cur_rule, w->rule, MAX_COLLATION_WEIGHTS); */ +/* } */ + +/* fprintf(stderr, "reorder_after succeeded for %s\n", t); */ +} + +static void do_reorder_end(void) +{ + if (!(order_state & IN_REORDER)) { + error_msg("reorder_end with no matching reorder_after"); + } + order_state &= ~IN_REORDER; +} + +static void do_reorder_sections_after(void) +{ + const char *t; + ll_item_t *lli; + + if (order_state & ~IN_REORDER_SECTIONS) { + error_msg("reorder_sections_after following order_start or reorder_after"); + } + order_state |= IN_REORDER_SECTIONS; + + if (superset) { + error_msg("currently reorder_sections_after is not supported in supersets"); + } + + if (!(t = 
next_token())) { + error_msg("missing arg for reorder_sections_after"); + } + + t = xsymdup(t); + + if (next_token() != NULL) { + error_msg("trailing text reorder_sections_after: %s", pos); + } + + if (cur_col == cur_base) { + error_msg("sorry.. reorder_sections_after in base locale is not currently supported"); + } + + lli = cur_base->section_list; + do { +/* fprintf(stderr, "hmm -- |%s|%d|\n", ((section_t *)(lli->data))->name, lli->data_type); */ + if (lli->data_type & DT_SECTION) { +/* fprintf(stderr, "checking |%s|%s|\n", ((section_t *)(lli->data))->name, t); */ + if (!strcmp(((section_t *)(lli->data))->name, t)) { + reorder_section_ptr = lli; + return; + } + } + lli = lli->next; + } while (lli); + + error_msg("reorder_sections_after for non-base item currently not supported: %s", t); +} + +static void do_reorder_sections_end(void) +{ + if (!(order_state & IN_REORDER_SECTIONS)) { + error_msg("reorder_sections_end with no matching reorder_sections_after"); + } + order_state &= ~IN_REORDER_SECTIONS; + + reorder_section_ptr = NULL; +} + +static ll_item_t *new_ll_item(int data_type, void *data) +{ + ll_item_t *p; + + p = xmalloc(sizeof(ll_item_t)); + p->next = p->prev = NULL; + p->data_type = data_type; + p->data = data; + p->idx = INT_MIN; + + return p; +} + +static int sym_cmp(const void *n1, const void *n2) +{ +/* fprintf(stderr, "sym_cmp: |%s| |%s|\n", (const char *)n1, (const char *)n2); */ + return strcmp((const char *) n1, (const char *) n2); +} + +static char *xsymdup(const char *s) +{ + void *p; + + if (!(p = tfind(s, &root_sym, sym_cmp))) { /* not a currently known symbol */ + if (!(s = strdup(s)) || !(p = tsearch(s, &root_sym, sym_cmp))) { + error_msg("OUT OF MEMORY!"); + } + ++num_sym; + mem_sym += strlen(s) + 1; +/* fprintf(stderr, "xsymdup: alloc |%s| %p |%s| %p\n", *(char **)p, p, s, s); */ +/* } else { */ +/* fprintf(stderr, "xsymdup: found |%s| %p\n", *(char **)p, p); */ + } + return *(char **) p; +} + +static int weight_cmp(const void *n1, const 
void *n2) +{ + const weight_t *w1 = (const weight_t *) n1; + const weight_t *w2 = (const weight_t *) n2; + int i, r; + + if (w1->num_weights != w2->num_weights) { + return w1->num_weights - w2->num_weights; + } + + for (i=0 ; i < w1->num_weights ; i++) { + if (w1->rule[i] != w2->rule[i]) { + return w1->rule[i] - w2->rule[i]; + } + if ((r = strcmp(w1->colitem[i], w2->colitem[i])) != 0) { + return r; + } + } + return 0; +} + +static weight_t *register_weight(weight_t *w) +{ + void *p; + + if (!(p = tfind(w, &root_weight, weight_cmp))) { /* new weight */ + p = xmalloc(sizeof(weight_t)); + memcpy(p, w, sizeof(weight_t)); + if (!(p = tsearch(p, &root_weight, weight_cmp))) { + error_msg("OUT OF MEMORY!"); + } + ++unique_weights; +/* } else { */ +/* fprintf(stderr, "rw: found\n"); */ + } + return *(weight_t **)p; +} + +static size_t ll_len(ll_item_t *l) +{ + size_t n = 0; + ll_item_t *p = l; + + while (p) { + ++n; + p = p->next; + if (p == l) { /* work for circular too */ + break; + } + } + return n; +} + +static size_t ll_count(ll_item_t *l, int mask) +{ + size_t n = 0; + ll_item_t *p = l; + + while (p) { + if (p->data_type & mask) { + ++n; + } + p = p->next; + if (p == l) { /* work for circular too */ + break; + } + } + return n; +} + + +static int wi_index_cmp(const void *n1, const void *n2) +{ + const char *s1 = ((weighted_item_t *)(((ll_item_t *) n1)->data))->symbol; + const char *s2 = ((weighted_item_t *)(((ll_item_t *) n2)->data))->symbol; + + return strcmp(s1, s2); +} + +static void add_wi_index(ll_item_t *l) +{ + assert(l->data_type == DT_WEIGHTED); + + if (!strcmp(((weighted_item_t *)(l->data))->symbol, "UNDEFINED")) { + cur_col->undefined_idx = l; + } + + if (!tfind(l, &cur_col->root_wi_index, wi_index_cmp)) { /* new wi_index */ + if (!tsearch(l, &cur_col->root_wi_index, wi_index_cmp)) { + error_msg("OUT OF MEMORY!"); + } + } + + if (cur_base != cur_col) { + if (!tfind(l, &cur_base->root_wi_index, wi_index_cmp)) {/* not a base val */ +/* printf("derived: %s\n", 
((weighted_item_t *)(l->data))->symbol); */ + if (!tfind(l, &cur_base->root_derived_wi, wi_index_cmp)) { /* new derived */ + if (!tsearch(l, &cur_base->root_derived_wi, wi_index_cmp)) { + error_msg("OUT OF MEMORY!"); + } + } + } + } +} + +static int final_index; + + +static int is_ucode(const char *s) +{ + if ((s[0] == '<') + && (s[1] == 'U') + && isxdigit(s[2]) + && isxdigit(s[3]) + && isxdigit(s[4]) + && isxdigit(s[5]) + && (s[6] == '>') + ) { + return 7; + } else { + return 0; + } +} + +static void add_final_col_index(const char *s) +{ + ENTRY e; + + e.key = (char *) s; + e.data = (void *)(final_index); + if (!hsearch(e, FIND)) { /* not in the table */ + if (!hsearch(e, ENTER)) { + error_msg("OUT OF MEMORY! (hsearch)"); + } +#if 0 + { + int n; + void *v; + colitem_t ci; + colitem_t *p; + const char *t; + + if (!strcmp(s, "UNDEFINED")) { + printf("%6d: %s\n", final_index, s); + } else { + assert(*s == '<'); + if ((n = is_ucode(s)) != 0) { + assert(!s[n]); + printf("%6d: %s\n", final_index, s); + } else { + ci.string = (char *) s; + ci.element = NULL; /* don't care */ + v = tfind(&ci, &cur_base->root_colitem, colitem_cmp); + if (!v) { + fprintf(stderr, "%s NOT DEFINED!!!\n", s); + } else { + p = *((colitem_t **) v); + if (p->element != NULL) { + t = p->element; + assert(*t == '"'); + ++t; + n = is_ucode(t); + assert(n); + printf("%6d: %.*s | ", final_index, n, t); + do { + t += n; + assert(*t); + if (*t == '"') { + assert(!t[1]); + break; + } + n = is_ucode(t); + assert(n); + printf("%.*s", n, t); + } while (1); + printf(" collating-element %s\n", s); + } else { + printf("%6d: %s (collating-symbol)\n", final_index, s); + } + } + } + } + } +#endif + ++final_index; + } + +} + +static int final_index_val0(const char *s) +{ + ENTRY *p; + ENTRY e; + e.key = (char *) s; + + if (!(p = hsearch(e, FIND))) { /* not in the table */ + return 0; + } + + return (int)(p->data); +} + +static int final_index_val(const char *s) +{ + ENTRY *p; + ENTRY e; + e.key = (char *) s; + + if 
(!(p = hsearch(e, FIND))) { /* not in the table */ + error_msg("can't find final index: %s", s); + } + + return (int)(p->data); +} + +static size_t num_tree_nodes; + +static void count_nodes(const void *ptr, VISIT order, int level) +{ + if ((order == postorder) || (order == leaf)) { + ++num_tree_nodes; + } +} + +static size_t tnumnodes(const void *root) +{ + num_tree_nodes = 0; + + twalk(root, count_nodes); + + return num_tree_nodes; + +} + +static ll_item_t *find_wi_index(const char *sym, col_locale_t *cl) +{ + weighted_item_t w; + ll_item_t l; + void *p; + + w.symbol = sym; + l.data = &w; + l.data_type = DT_WEIGHTED; + + p = tfind(&l, &cl->root_wi_index, wi_index_cmp); + + if (p) { + p = *(ll_item_t **)p; + } + + return (ll_item_t *) p; +} + +static void mark_reordered(const char *sym) +{ + ll_item_t *lli; + + lli = find_wi_index(sym, cur_base); + + if (lli) { + if (!tsearch(lli, &cur_base->root_wi_index_reordered, wi_index_cmp)) { + error_msg("OUT OF MEMORY!"); + } + } +} + +static ll_item_t *find_wi_index_reordered(const char *sym) +{ + weighted_item_t w; + ll_item_t l; + void *p; + + w.symbol = sym; + l.data = &w; + l.data_type = DT_WEIGHTED; + + p = tfind(&l, &cur_base->root_wi_index_reordered, wi_index_cmp); + + if (p) { + p = *(ll_item_t **)p; + } + + return (ll_item_t *) p; +} + +static ll_item_t *init_comm_ptr(void) +{ + assert(cur_base); + assert(cur_base->section_list); + /* at the moment, only support one section in comm */ + assert(cur_base->section_list->next == NULL); + + comm_cur_ptr = ((section_t *)(cur_base->section_list->data))->itm_list; + + while (comm_cur_ptr && (comm_cur_ptr->data_type & DT_REORDER)) { + comm_cur_ptr = comm_cur_ptr->next; + } + +#warning devel code +/* { */ +/* ll_item_t *p = comm_cur_ptr; */ +/* fprintf(stderr, "init_comm_ptr\n"); */ + +/* while (p != comm_cur_ptr) { */ +/* if (p->data_type & DT_WEIGHTED) { */ +/* fprintf(stderr, "%s", ((weighted_item_t *)p)->symbol); */ +/* } */ +/* p = p->next; */ +/* } */ +/* } */ + + 
assert(comm_cur_ptr); + +/* fprintf(stderr, "init_comm_ptr -- %s %p %p %p %d\n", */ +/* ((weighted_item_t *)(comm_cur_ptr->data))->symbol, */ +/* comm_cur_ptr, comm_cur_ptr->prev, comm_cur_ptr->next, */ +/* ll_len(comm_cur_ptr)); */ + + comm_prev_ptr = NULL; + return comm_cur_ptr; +} + +static ll_item_t *next_comm_ptr(void) +{ + /* at the moment, only support one section in comm */ + assert(cur_base->section_list->next == NULL); + + comm_prev_ptr = comm_cur_ptr; + + while (comm_cur_ptr && ((comm_cur_ptr = comm_cur_ptr->next) != NULL)) { + if (!(comm_cur_ptr->data_type & DT_REORDER)) { + break; + } + } + + return comm_cur_ptr; +} + +static int dump_count; + +#if 0 +static void dump_section(section_t *s, int mask, col_locale_t *der) +{ + ll_item_t *lli; + ll_item_t *lli0; + weighted_item_t *w; + weight_t *p; + int i; + + lli0 = lli = s->itm_list; + + if (!lli0) { + return; + } + + do { + if (!(lli->data_type & mask)) { + lli = lli->next; + continue; + } + if (lli->data_type & DT_WEIGHTED) { + ++dump_count; + w = (weighted_item_t *)(lli->data); + p = w->weight; + printf("%6d: %s (%d) ", dump_count, w->symbol, p->num_weights); + for (i = 0 ; i < p->num_weights ; i++) { + if (p->rule[i] & R_FORWARD) { + printf("F"); + } + if (p->rule[i] & R_BACKWARD) { + printf("B"); + } + if (p->rule[i] & R_POSITION) { + printf("P"); + } + printf(","); + } + for (i = 0 ; i < p->num_weights ; i++) { + printf(" %s", p->colitem[i]); + } + printf("\n"); + } else if (lli->data_type & (DT_SECTION|DT_REORDER)) { + + if (lli->data_type == DT_REORDER) { + assert(der); + if (strncmp(((section_t *)(lli->data))->name, der->name, strlen(der->name))) { + lli = lli->next; + continue; + } + } + + if (lli->data_type & DT_SECTION) { + printf("SECTION -----------------\n"); + } else { + printf("REORDER -----------------\n"); + } + + dump_section((section_t *)(lli->data), mask, der); + printf("DONE --------------------\n"); + } + lli = lli->next; + } while (lli != lli0); +} +#else +static int 
in_reorder_section = 0; + +static void dump_section(section_t *s, int mask, col_locale_t *der) +{ + ll_item_t *lli; + ll_item_t *lli0; + weighted_item_t *w; + weight_t *p; + int i; + + lli0 = lli = s->itm_list; + + if (!lli0) { + return; + } + + do { + if (!(lli->data_type & mask)) { + lli = lli->next; + continue; + } + if (lli->data_type & DT_WEIGHTED) { + ++dump_count; + w = (weighted_item_t *)(lli->data); + p = w->weight; +#if 1 + if (in_reorder_section) { + printf(" %p", w); + } +#else + printf("%6d: %s (%d) ", dump_count, w->symbol, p->num_weights); + for (i = 0 ; i < p->num_weights ; i++) { + if (p->rule[i] & R_FORWARD) { + printf("F"); + } + if (p->rule[i] & R_BACKWARD) { + printf("B"); + } + if (p->rule[i] & R_POSITION) { + printf("P"); + } + printf(","); + } + for (i = 0 ; i < p->num_weights ; i++) { + printf(" %s", p->colitem[i]); + } + printf("\n"); +#endif + } else if (lli->data_type & (DT_SECTION|DT_REORDER)) { + + if (lli->data_type == DT_REORDER) { + assert(der); + if (strncmp(((section_t *)(lli->data))->name, der->name, strlen(der->name))) { + lli = lli->next; + continue; + } + } + + if (lli->data_type & DT_SECTION) { +/* printf("SECTION -----------------\n"); */ + assert(0); + } else { +/* printf("REORDER -----------------\n"); */ + in_reorder_section = 1; + } + + dump_section((section_t *)(lli->data), mask, der); +/* printf("DONE --------------------\n"); */ + printf("\n"); + in_reorder_section = 0; + } + lli = lli->next; + } while (lli != lli0); +} +#endif + +static void dump_weights(const char *name) +{ + ll_item_t *lli; + col_locale_t *base; + col_locale_t *der; + col_locale_t cl; + void *p; + + assert(name); + + if (!*name) { /* use last */ + base = cur_base; + der = cur_derived; + } else { + cl.name = (char *) name; + if (!(p = tfind(&cl, &root_col_locale, col_locale_cmp))) { + error_msg("unknown locale: %s", name); + } + base = *((col_locale_t **) p); + der = NULL; + if (base->base_locale) { /* oops... 
really derived */ + der = base; + base = der->base_locale; + } + } + + dump_count = 0; + + if (base) { +/* printf("BASE - %s\n", base->name); */ + for (lli = base->section_list ; lli ; lli = lli->next) { +/* printf("SECTION %s\n", ((section_t *)(lli->data))->name); */ + dump_section((section_t *)(lli->data), ~0, der); + } + } + + assert(der != base); + + if (der) { +/* printf("DERIVED - %s\n", der->name); */ + for (lli = der->section_list ; lli ; lli = lli->next) { + if (lli->data_type == DT_SECTION) { + dump_section((section_t *)(lli->data), DT_WEIGHTED, der); + } + } + } +/* printf("DONE\n"); */ +} + +static void print_starter_node(const void *ptr, VISIT order, int level) +{ + if (order == postorder || order == leaf) { + fprintf(stderr, " %s\n", *(const char **) ptr); + } +} + +static void finalize_base(void) +{ + ll_item_t *s; + ll_item_t *h; + ll_item_t *lli; + ll_item_t *h2; + ll_item_t *l2; + ll_item_t *cli; + ll_item_t *rli = NULL; + weighted_item_t *w; + weight_t *p; + int i, n, mr, r, mi; + col_locale_t *cl; + void *mm; + + int num_invariant = 0; + int num_varying = 0; + int max_weight; + int index2weight_len_inc = 1; + + assert(cur_base); + assert(base_locale_len+1 < BASE_LOCALE_LEN); + + base_locale_array[base_locale_len].name = cur_base->name; + base_locale_array[base_locale_len].num_weights = 1; + base_locale_array[base_locale_len].index2weight_offset = index2weight_len; + base_locale_array[base_locale_len].index2ruleidx_offset = index2ruleidx_len; + if (!strcmp(cur_base->name,"ja_JP") || !strcmp(cur_base->name,"ko_KR")) { +#warning fix the index2weight check!! + index2weight_len_inc = 0; + } +/* printf("%s -- index2weight_len = %d\n", cur_base->name, index2weight_len); */ + + if (!hcreate(30000)) { + error_msg("OUT OF MEMORY!"); + } + + /* first pass ... 
set the fixed indexes */ + final_index = i = 1; + mr = 0; + for (s = cur_base->section_list ; s ; s = s->next) { +#if 1 + if (s->data_type & DT_REORDER) { /* a reordered section */ + fprintf(stderr, "pass1: reordered section %s - xxx\n", ((section_t *)(s->data))->name); + lli = ((section_t *)(s->data))->itm_list; + r = 0; + if (lli) { +/* r = ll_len( ((section_t *)(lli->data))->itm_list ); */ + r = ll_len(lli) + 1; + } + if (r > mr) { + mr = r; + } + fprintf(stderr, "pass1: reordered section %s - %d\n", ((section_t *)(s->data))->name, r); + continue; + } +#endif + h = lli = ((section_t *)(s->data))->itm_list; + if (!lli) { + continue; + } + do { + if (lli->data_type & DT_RANGE) { + i += mr; + mr = 0; +#warning check ko_kR and 9 +/* ++i; */ + lli->idx = i; + assert(!rli); + rli = lli; + fprintf(stderr, "range pre = %d after = ", i); + i += ((range_item_t *)(lli->data))->length + 1; +#warning check ko_kR and 9 +/* ++i; */ + fprintf(stderr, "%d\n", i); + if (!index2weight_len_inc) { /* ko_KR hack */ + final_index += ((range_item_t *)(lli->data))->length + 1; + } +/* add_final_col_index("RANGE"); */ + } else if (lli->data_type & DT_WEIGHTED) { + i += mr; + mr = 0; + w = (weighted_item_t *)(lli->data); + if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */ + ++num_varying; + ++i; + continue; + } + ++num_invariant; + index2weight_buffer[index2weight_len] = lli->idx = i++; + index2weight_len += index2weight_len_inc; + add_final_col_index(w->symbol); + + } else { + assert(lli->data_type & DT_REORDER); + r = ll_len( ((section_t *)(lli->data))->itm_list ); +#warning check ko_kR and 9 + if (r > mr) { + mr = r; + } +/* r = 0; */ + } + } while ((lli = lli->next) != h); + } + + /* second pass ... 
set the reordered indexes */ + mi = i + mr; + mr = i = 0; + for (s = cur_base->section_list ; s ; s = s->next) { + h = lli = ((section_t *)(s->data))->itm_list; + if (!lli) { + continue; + } + do { + if (lli->data_type & DT_RANGE) { + i += mr; + mr = 0; + i = lli->idx + ((range_item_t *)(lli->data))->length + 1; +#warning check + } else if ((lli->data_type & DT_WEIGHTED) && !(s->data_type & DT_REORDER)) { + i += mr; + mr = 0; + w = (weighted_item_t *)(lli->data); + if (find_wi_index_reordered(w->symbol) /* reordered symbol skipped on first pass */ +#if 0 + || (s->data_type & DT_REORDER) /* or in a reordered section */ +#endif + ) { + assert(!(s->data_type & DT_REORDER)); + index2weight_buffer[index2weight_len] = lli->idx = ++i; + index2weight_len += index2weight_len_inc; + add_final_col_index(w->symbol); + +/* fprintf(stdout, "%11s: r %6d %6d %s\n", */ +/* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */ + + continue; + } + i = lli->idx; + +/* fprintf(stdout, "%11s: w %6d %6d %s\n", */ +/* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */ + + } else { +/* fprintf(stderr, "section: %s %d %d\n", ((section_t *)(s->data))->name, */ +/* s->data_type, lli->data_type); */ +/* assert(!(s->data_type & DT_REORDER)); */ +/* assert(lli->data_type & DT_REORDER); */ +#if 1 + if (s->data_type & DT_REORDER) { + h2 = l2 = lli; + if (!h2) { + continue; + } + } else { + assert(s->data_type & DT_SECTION); + h2 = l2 = ((section_t *)(lli->data))->itm_list; + if (!h2) { + continue; + } + } + + +#else + h2 = l2 = ((section_t *)(lli->data))->itm_list; + if (!h2) { + continue; + } +#endif + r = 0; + do { + assert(l2->data_type & DT_WEIGHTED); + ++r; + l2->idx = i + r; + +/* fprintf(stdout, "%s: R %6d %s\n", */ +/* ((section_t *)(lli->data))->name, l2->idx, ((weighted_item_t *)(l2->data))->symbol); */ + + } while ((l2 = l2->next) != h2); + if (r > mr) { + mr = r; + } + } + } while ((lli = lli->next) != h); + } + + /* finally, walk through all derived 
locales and set non-reordered section items */ + mr = mi; + for (cli = cur_base->derived_list ; cli ; cli = cli->next) { + cl = (col_locale_t *)(cli->data); +/* fprintf(stderr, "pass3: %d %s\n", cli->data_type, cl->name); */ + +/* fprintf(stdout, "pass3: %d %s\n", cli->data_type, cl->name); */ + + assert(cli->data_type == DT_COL_LOCALE); + + i = mi; + for (s = cl->section_list ; s ; s = s->next) { +/* if (s->data_type & DT_REORDER) { */ +/* continue; */ +/* } */ + h = lli = ((section_t *)(s->data))->itm_list; + if (!lli) { + continue; + } + do { + assert(!(lli->data_type & DT_RANGE)); + if (lli->data_type & DT_WEIGHTED) { +/* fprintf(stderr, " %d %d %s\n", lli->data_type, lli->idx, ((weighted_item_t *)(lli->data))->symbol); */ + add_final_col_index(((weighted_item_t *)(lli->data))->symbol); + if (s->data_type & DT_REORDER) { + continue; + } + assert(lli->idx == INT_MIN); + lli->idx = ++i; + +/* fprintf(stdout, "%11s: S %6d %6d %s\n", */ +/* cl->name, lli->idx, */ +/* final_index_val(((weighted_item_t *)(lli->data))->symbol), */ +/* ((weighted_item_t *)(lli->data))->symbol); */ + + } else { + assert(0); + assert(lli->data_type & DT_SECTION); + + h2 = l2 = ((section_t *)(lli->data))->itm_list; + if (!h2) { + continue; + } + do { + assert(l2->data_type & DT_WEIGHTED); + assert(l2->idx == INT_MIN); + l2->idx = ++i; + add_final_col_index(((weighted_item_t *)(l2->data))->symbol); + } while ((l2 = l2->next) != h2); + } + } while ((lli = lli->next) != h); + } + if (i > mr) { + mr = i; + } + } + max_weight = mr; + + assert(num_varying == tnumnodes(cur_base->root_wi_index_reordered)); + + /* we can now initialize the wcs2index array */ + { + ENTRY *p; + ENTRY e; + char buf[8]; + static const char xd[] = "0123456789ABCDEF"; + int starter_index = final_index; + int wcs2index_count = 0; + + strcpy(buf, ""); + memset(wcs2index, 0, sizeof(wcs2index)); + e.key = (char *) buf; + for (i=1 ; i <= 0xffff ; i++) { + buf[5] = xd[ i & 0xf ]; + buf[4] = xd[ (i >> 4) & 0xf ]; + buf[3] = 
xd[ (i >> 8) & 0xf ]; + buf[2] = xd[ (i >> 12) & 0xf ]; + + if ((p = hsearch(e, FIND)) != NULL) { + ++wcs2index_count; + if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) { + wcs2index[i] = ++starter_index; +/* fprintf(stderr, "wcs2index[ %#06x ] = %d (starter)\n", i, wcs2index[i]); */ + } else { + wcs2index[i] = (int)(p->data); +/* fprintf(stderr, "wcs2index[ %#06x ] = %d\n", i, wcs2index[i]); */ + } + } else { + if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) { + error_msg("marked starter but not in hash: %s", buf); + } + } + } + + + /* ---------------------------------------------------------------------- */ + { + int i, n; + table_data table; + size_t t, smallest; + + n = 0; + smallest = SIZE_MAX; + table.ii = NULL; + for (i=0 ; i < 14 ; i++) { + if ((RANGE >> i) < 4) { + break; + } + t = newopt(wcs2index, RANGE, i, &table); + if (smallest >= t) { + n = i; + smallest = t; + /* } else { */ + /* break; */ + } + } + + +/* printf("smallest = %u for range %#x (%u)\n", smallest, RANGE, RANGE); */ + assert(smallest != SIZE_MAX); + if (smallest + wcs2colidt_len >= WCS2COLIDT_LEN) { + error_msg("WCS2COLIDT_LEN too small"); + } + base_locale_array[base_locale_len].wcs2colidt_offset = wcs2colidt_len; + table.ii = wcs2colidt_buffer + wcs2colidt_len; + t = smallest; + smallest = SIZE_MAX; + smallest = newopt(wcs2index, RANGE, n, &table); + assert(t == smallest); + wcs2colidt_len += smallest; +/* fprintf(stderr, "smallest = %d wcs2colidt_len = %d\n", smallest, wcs2colidt_len); */ + +#if 0 + { + unsigned int sc, n, i0, i1; + unsigned int u = 0xe40; + table_data *tbl = &table; + +#define WCctype_TI_MASK ((1 << tbl->ti_shift)-1) +#define WCctype_TI_SHIFT (tbl->ti_shift) +#define WCctype_TI_LEN (tbl->ti_len) +#define WCctype_II_MASK ((1 << tbl->ii_shift)-1) +#define WCctype_II_SHIFT (tbl->ii_shift) +#define WCctype_II_LEN (tbl->ii_len) + + sc = u & WCctype_TI_MASK; + u >>= WCctype_TI_SHIFT; + n = u & WCctype_II_MASK; + u >>= WCctype_II_SHIFT; + + 
i0 = tbl->ii[u]; + fprintf(stderr, "i0 = %d\n", i0); + i0 <<= WCctype_II_SHIFT; + i1 = tbl->ii[WCctype_II_LEN + i0 + n]; + /* i1 = tbl->ti[i0 + n]; */ + fprintf(stderr, "i1 = %d\n", i1); + i1 <<= WCctype_TI_SHIFT; + /* return *(uint16_t *)(&(tbl->ii[WCctype_II_LEN + WCctype_TI_LEN + i1 + sc])); */ + fprintf(stderr, "i2 = %d\n", WCctype_II_LEN + WCctype_TI_LEN + i1 + sc); + fprintf(stderr, "val = %d\n", tbl->ii[WCctype_II_LEN + WCctype_TI_LEN + i1 + sc]); + /* return tbl->ut[i1 + sc]; */ + + + } +#endif + base_locale_array[base_locale_len].ii_shift = table.ii_shift; + base_locale_array[base_locale_len].ti_shift = table.ti_shift; + base_locale_array[base_locale_len].ii_len = table.ii_len; + base_locale_array[base_locale_len].ti_len = table.ti_len; + } + /* ---------------------------------------------------------------------- */ + + base_locale_array[base_locale_len].num_col_base = num_invariant + num_varying; + base_locale_array[base_locale_len].max_col_index = final_index; + base_locale_array[base_locale_len].max_weight = max_weight; + + fprintf(stderr, "%s: %6u invariant %6u varying %6u derived %6u total %6u max weight %6u wcs2\n", + cur_base->name, num_invariant, num_varying, + tnumnodes(cur_base->root_derived_wi), final_index, max_weight, + wcs2index_count); + + } + +#if 1 + /* ok, now we need to dump out the base and derived tables... */ + /* don't forget to break up collating elements!!! */ + +/* fprintf(stdout, "**************************************************\n"); */ + /* first pass ... 
set the invariants */ + for (s = cur_base->section_list ; s ; s = s->next) { +#if 1 + if (s->data_type & DT_REORDER) { + fprintf(stderr, "1: skipping reordered section %s\n", ((section_t *)(s->data))->name); + continue; + } +#endif + h = lli = ((section_t *)(s->data))->itm_list; + if (!lli) { + continue; + } + do { + if (lli->data_type & DT_WEIGHTED) { + w = (weighted_item_t *)(lli->data); + if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */ + continue; + } + if (index2weight_len_inc) { + index2ruleidx_buffer[index2ruleidx_len++] = + add_rule((weighted_item_t *)(lli->data)); + } +/* fprintf(stdout, "%11s: w %6d %6d %s\n", */ +/* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */ + } + } while ((lli = lli->next) != h); + } + + /* second pass ... set varying */ + for (s = cur_base->section_list ; s ; s = s->next) { +#if 1 + if (s->data_type & DT_REORDER) { + fprintf(stderr, "2: skipping reordered section %s\n", ((section_t *)(s->data))->name); + continue; + } +#endif + h = lli = ((section_t *)(s->data))->itm_list; + if (!lli) { + continue; + } + do { + if (lli->data_type & DT_WEIGHTED) { + w = (weighted_item_t *)(lli->data); + if (find_wi_index_reordered(w->symbol)) { /* reordered symbol so skip on first pass */ + if (index2weight_len_inc) { + index2ruleidx_buffer[index2ruleidx_len++] = + add_rule((weighted_item_t *)(lli->data)); + } +/* fprintf(stdout, "%11s: r %6d %6d %s\n", */ +/* cur_base->name, lli->idx, final_index_val(w->symbol), w->symbol); */ + continue; + } + } + } while ((lli = lli->next) != h); + } + + do_starter_lists(cur_base); + + +/* fprintf(stderr,"updated final_index = %d\n", final_index); */ + + if (rli) { + base_locale_array[base_locale_len].range_low + = strtoul(((range_item_t *)(rli->data))->symbol1 + 2, NULL, 16); + base_locale_array[base_locale_len].range_count + = ((range_item_t *)(rli->data))->length; + base_locale_array[base_locale_len].range_base_weight = rli->idx; + 
base_locale_array[base_locale_len].range_rule_offset = add_range_rule((range_item_t *)(rli->data)); +/* fprintf(stdout, "%11s: %6d %6d %s %s (%d)\n", */ +/* "RANGE", rli->idx, -1, */ +/* ((range_item_t *)(rli->data))->symbol1, */ +/* ((range_item_t *)(rli->data))->symbol2, */ +/* ((range_item_t *)(rli->data))->length); */ + } + +/* fprintf(stdout,"\nDerived\n\n"); */ + + /* first, if base name is of the form ll_CC, add a derived locale for it */ + if ((strlen(cur_base->name) == 5) + && islower(cur_base->name[0]) + && islower(cur_base->name[1]) + && (cur_base->name[2] == '_') + && isupper(cur_base->name[3]) + && isupper(cur_base->name[4]) + ) { + + fprintf(stderr, "adding special derived for %s\n", cur_base->name); +/* fprintf(stderr,"updated final_index = %d\n", final_index); */ + + + assert(der_locale_len+1 < DER_LOCALE_LEN); + + der_locale_array[der_locale_len].name = cur_base->name; + der_locale_array[der_locale_len].base_idx = base_locale_len; + + u16_buf[0] = 1; + u16_buf[1] = 0; + u16_buf_len = 2; + + mm = NULL; + if ((u16_buf_len > override_len) || + !(mm = memmem(override_buffer, override_len*sizeof(override_buffer[0]), + u16_buf, u16_buf_len*sizeof(u16_buf[0]))) + ) { + assert(override_len + u16_buf_len < OVERRIDE_LEN); + memcpy(override_buffer + override_len, u16_buf, u16_buf_len*sizeof(u16_buf[0])); + der_locale_array[der_locale_len].overrides_offset = override_len; + override_len += u16_buf_len; +/* printf("%s: override_len = %d u16_buf_len = %d\n", cl->name, override_len, u16_buf_len); */ + } else if (!(u16_buf_len > override_len)) { + assert(mm); + der_locale_array[der_locale_len].overrides_offset = ((uint16_t *)(mm)) - override_buffer; +/* printf("%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */ + } + der_locale_array[der_locale_len].multistart_offset + = base_locale_array[base_locale_len].multistart_offset; + der_locale_array[der_locale_len].undefined_idx = final_index_val0("UNDEFINED"); + + if 
(!der_locale_array[der_locale_len].undefined_idx) { + error_msg("no UNDEFINED definition for %s", cur_base->name); + } + + ++der_locale_len; + } else { + fprintf(stderr, "NOT adding special derived for %s\n", cur_base->name); + } + + /* now all the derived... */ + for (cli = cur_base->derived_list ; cli ; cli = cli->next) { + cl = (col_locale_t *)(cli->data); + assert(cli->data_type == DT_COL_LOCALE); + + assert(der_locale_len+1 < DER_LOCALE_LEN); + + der_locale_array[der_locale_len].name = cl->name; + der_locale_array[der_locale_len].base_idx = base_locale_len; + + u16_buf_len = 0; + + for (i = 0 ; i < 2 ; i++) { + if (i) { +/* fprintf(stdout, " section --- (singles)\n"); */ + u16_buf[u16_buf_len++] = 1; /* single */ + } + /* we do this in two passes... first all sequences, then all single reorders */ + for (s = cl->section_list ; s ; s = s->next) { + h = lli = ((section_t *)(s->data))->itm_list; + if (!lli) { + continue; + } + assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0])); + if ((!i && (ll_len(h) > 1) ) || (ll_len(h) == i)) { + if (!i) { +/* fprintf(stdout, " section ----------------- %d %d\n", i, ll_len(h)); */ + u16_buf[u16_buf_len++] = ll_len(h); /* multi */ + assert(lli->data_type & DT_WEIGHTED); +#if 0 + u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol); /* start index */ +#endif + u16_buf[u16_buf_len++] = lli->idx; /* start weight */ + } + do { + if (lli->data_type & DT_WEIGHTED) { +/* fprintf(stdout, "%11s: S %6d %6d %s\n", */ +/* cl->name, lli->idx, */ +/* final_index_val(((weighted_item_t *)(lli->data))->symbol), */ +/* ((weighted_item_t *)(lli->data))->symbol); */ +#if 0 + if (i) { + assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0])); + u16_buf[u16_buf_len++] = final_index_val(((weighted_item_t *)(lli->data))->symbol); + assert(u16_buf[u16_buf_len-1]); + u16_buf[u16_buf_len++] = lli->idx; /* weight */ + } +#else + assert(u16_buf_len +4 < sizeof(u16_buf)/sizeof(u16_buf[0])); + u16_buf[u16_buf_len++] 
= final_index_val(((weighted_item_t *)(lli->data))->symbol); + assert(u16_buf[u16_buf_len-1]); + if (i) { + u16_buf[u16_buf_len++] = lli->idx; /* weight */ + } +#endif + u16_buf[u16_buf_len++] = add_rule((weighted_item_t *)(lli->data)); + + } + } while ((lli = lli->next) != h); + } + } + } + u16_buf[u16_buf_len++] = 0; + + mm = NULL; + if ((u16_buf_len > override_len) || + !(mm = memmem(override_buffer, override_len*sizeof(override_buffer[0]), + u16_buf, u16_buf_len*sizeof(u16_buf[0]))) + ) { + assert(override_len + u16_buf_len < OVERRIDE_LEN); + memcpy(override_buffer + override_len, u16_buf, u16_buf_len*sizeof(u16_buf[0])); + der_locale_array[der_locale_len].overrides_offset = override_len; + override_len += u16_buf_len; +/* printf("%s: override_len = %d u16_buf_len = %d\n", cl->name, override_len, u16_buf_len); */ + } else if (!(u16_buf_len > override_len)) { + assert(mm); + der_locale_array[der_locale_len].overrides_offset = ((uint16_t *)(mm)) - override_buffer; +/* printf("%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */ + } + + do_starter_lists(cl); + + der_locale_array[der_locale_len].undefined_idx = final_index_val0("UNDEFINED"); +#if 0 + assert(der_locale_array[der_locale_len].undefined_idx); + if (!der_locale_array[der_locale_len].undefined_idx) { + der_locale_array[der_locale_len].undefined_idx = base_locale_array[base_locale_len].undefined_idx; + } +#endif + + if (!der_locale_array[der_locale_len].undefined_idx) { + error_msg("no UNDEFINED definition for %s", cl->name); + } + + ++der_locale_len; + } + +#endif + +#warning handle UNDEFINED idx specially? what if in only some of derived? 
+/* base_locale_array[base_locale_len].undefined_idx = final_index_val0("UNDEFINED"); */ + base_locale_array[base_locale_len].undefined_idx = 0; + + + hdestroy(); + + ++base_locale_len; + +/* if (tnumnodes(cur_base->root_starter_char)) { */ +/* fprintf(stderr, "starter nodes\n"); */ +/* twalk(cur_base->root_starter_char, print_starter_node); */ +/* } */ +} + +static int starter_all_cmp(const void *n1, const void *n2) +{ + const char *s1 = ((weighted_item_t *) n1)->symbol; + const char *s2 = ((weighted_item_t *) n2)->symbol; + colitem_t x; + colitem_t *p; + int n; + + /* sort by 1st char ... then inverse for string */ + + x.element = NULL; + if (!is_ucode(s1)) { + x.string = s1; + p = tfind(&x, &cur_base->root_colitem, colitem_cmp); + s1 = (*((colitem_t **) p))->element + 1; + } + if (!is_ucode(s2)) { + x.string = s2; + p = tfind(&x, &cur_base->root_colitem, colitem_cmp); + s2 = (*((colitem_t **) p))->element + 1; + } + + /* < */ + /* 01234567 */ + + assert(is_ucode(s1)); + assert(is_ucode(s2)); + + n = strncmp(s1+2, s2+2, 4); + if (n) { + return n; + } + + s1 += 7; + s2 += 7; + + return strcmp(s2, s1); +} + +static void print_starter_all_node(const void *ptr, VISIT order, int level) +{ + const weighted_item_t *w = *(const weighted_item_t **) ptr; + colitem_t *ci; + void *p; + int n; + colitem_t x; + + if (order == postorder || order == leaf) { +#if 0 + if ((n = is_ucode(w->symbol)) != 0) { + printf(" %s\n", w->symbol); + } else { + x.string = w->symbol; + x.element = NULL; + p = tfind(&x, &cur_base->root_colitem, colitem_cmp); + assert(p); + ci = *((colitem_t **) p); + printf("%s = %s\n", ci->element, w->symbol); + } +#else + printf("%s|", w->symbol); +/* if ((n = is_ucode(w->symbol)) != 0) { */ +/* printf("\n"); */ +/* } */ +#endif + } +} + +static void process_starter_node(const void *ptr, VISIT order, int level) +{ + const weighted_item_t *w = *(const weighted_item_t **) ptr; + colitem_t *ci; + void *p; + int n; + colitem_t x; + const char *s; + char buf[32]; + 
+ /* store index of collation item followed by (unprefixed) nul-terminated string */ + if (order == postorder || order == leaf) { + if ((n = is_ucode(w->symbol)) != 0) { + u16_buf[u16_buf_len++] = final_index_val(w->symbol); + assert(u16_buf[u16_buf_len-1]); + u16_buf[u16_buf_len++] = 0; + if (++u16_starter < base_locale_array[base_locale_len].num_starters) { + u16_buf[u16_starter] = u16_buf_len; + } +/* fprintf(stderr, "ucode - %d %d\n", u16_buf[u16_starter-1], u16_buf_len); */ + } else { + x.string = w->symbol; + x.element = NULL; + p = tfind(&x, &cur_base->root_colitem, colitem_cmp); + assert(p); + ci = *((colitem_t **) p); + s = ci->element; + u16_buf[u16_buf_len++] = final_index_val(w->symbol); + assert(u16_buf[u16_buf_len-1]); + assert(*s == '"'); + n = is_ucode(++s); +/* fprintf(stderr, "s is |%s| with len %d (%d)\n", s, strlen(s), n); */ + assert(n); + s += n; + while (*s != '"') { + n = is_ucode(s); + assert(n); + strncpy(buf, s, n+1); + buf[n] = 0; +/* fprintf(stderr, "buf is |%s| with len %d (%d)\n", buf, strlen(buf), n); */ + u16_buf[u16_buf_len++] = final_index_val(buf); + assert(u16_buf[u16_buf_len-1]); + s += n; + } + u16_buf[u16_buf_len++] = 0; + } + } +} + +static void **p_cl_root_starter_all; + +static void complete_starter_node(const void *ptr, VISIT order, int level) +{ + weighted_item_t w; + weighted_item_t *p; + + if (order == postorder || order == leaf) { + w.symbol = *(const char **) ptr; + w.weight = NULL; + if (!tfind(&w, p_cl_root_starter_all, starter_all_cmp)) { + p = xmalloc(sizeof(weighted_item_t)); + p->symbol = w.symbol; + p->weight = NULL; +/* fprintf(stderr, "complete_starter_node: %s\n", *(const char **) ptr); */ + if (!tsearch(p, p_cl_root_starter_all, starter_all_cmp)) { + error_msg("OUT OF MEMORY"); + } + } + } +} + +static void do_starter_lists(col_locale_t *cl) +{ + ll_item_t *s; + ll_item_t *h; + ll_item_t *lli; + col_locale_t *c; + colitem_t *ci; + weighted_item_t *w; + void *p; + char buf[32]; + int n; + colitem_t x; + 
void *mm; + + c = cl; + if (c != cur_base) { + c = cur_base; + } + +/* printf("STARTERS %s --------------------\n", cl->name); */ + LOOP: + for (s = c->section_list ; s ; s = s->next) { + h = lli = ((section_t *)(s->data))->itm_list; + if (!lli) { + continue; + } + do { + if (lli->data_type & DT_WEIGHTED) { + w = (weighted_item_t *)(lli->data); + ci = NULL; + if ((n = is_ucode(w->symbol)) != 0) { + strcpy(buf, w->symbol); + } else { +/* fprintf(stdout, "looking for |%s|\n", w->symbol); */ + x.string = w->symbol; + x.element = NULL; + p = tfind(&x, &cur_base->root_colitem, colitem_cmp); + if (!p) { +/* fprintf(stderr, "Whoa... processing starters for %s and couldn't find %s\n", */ +/* cl->name, w->symbol); */ + continue; + } + ci = *((colitem_t **) p); + if (!ci->element) { /* just a collating symbol */ + continue; + } + assert(ci->element[0] == '"'); + n = is_ucode(ci->element + 1); + assert(n); + strncpy(buf, ci->element + 1, n); + } + if ((tfind(buf, &cur_base->root_starter_char, sym_cmp)) != NULL) { +/* fprintf(stdout, "adding from %s: %s", c->name, w->symbol); */ +/* if (ci) { */ +/* fprintf(stdout, " = %s", ci->element); */ +/* } */ +/* fprintf(stdout, "\n"); */ + + if (!tsearch(w, &cl->root_starter_all, starter_all_cmp)) { + error_msg("OUT OF MEMORY"); + } + } + } + } while ((lli = lli->next) != h); + } + + if (c != cl) { + c = cl; + goto LOOP; + } + + p_cl_root_starter_all = &cl->root_starter_all; + twalk(cur_base->root_starter_char, complete_starter_node); + + if (cl == cur_base) { + base_locale_array[base_locale_len].num_starters = tnumnodes(cur_base->root_starter_char); + } + +#if 0 + printf("\nNow walking tree...\n\n"); + twalk(cl->root_starter_all, print_starter_all_node); + printf("\n\n"); + +#endif + u16_starter = 0; + u16_buf[0] = u16_buf_len = base_locale_array[base_locale_len].num_starters; + twalk(cl->root_starter_all, process_starter_node); +/* fprintf(stderr, "s=%d n=%d\n", u16_starter, base_locale_array[base_locale_len].num_starters); */ + 
assert(u16_starter == base_locale_array[base_locale_len].num_starters); + +#if 0 + { int i; + for (i=0 ; i < u16_buf_len ; i++) { + fprintf(stderr, "starter %2d: %d - %#06x\n", i, u16_buf[i], u16_buf[i]); + }} +#endif + + mm = NULL; + if (u16_buf_len) { +/* assert(base_locale_array[base_locale_len].num_starters); */ + if ((u16_buf_len > multistart_len) || + !(mm = memmem(multistart_buffer, multistart_len*sizeof(multistart_buffer[0]), + u16_buf, u16_buf_len*sizeof(u16_buf[0]))) + ) { + assert(multistart_len + u16_buf_len < MULTISTART_LEN); + memcpy(multistart_buffer + multistart_len, u16_buf, u16_buf_len*sizeof(u16_buf[0])); + if (cl == cur_base) { + base_locale_array[base_locale_len].multistart_offset = multistart_len; + } else { + der_locale_array[der_locale_len].multistart_offset = multistart_len; + } + multistart_len += u16_buf_len; +/* fprintf(stderr, "%s: multistart_len = %d u16_buf_len = %d\n", cl->name, multistart_len, u16_buf_len); */ + } else if (!(u16_buf_len > multistart_len)) { + assert(mm); + if (cl == cur_base) { + base_locale_array[base_locale_len].multistart_offset = ((uint16_t *)(mm)) - multistart_buffer; + } else { + der_locale_array[der_locale_len].multistart_offset = ((uint16_t *)(mm)) - multistart_buffer; + } +/* fprintf(stderr, "%s: memmem found a match with u16_buf_len = %d\n", cl->name, u16_buf_len); */ + } + } else { + assert(!base_locale_array[base_locale_len].num_starters); + } + +/* printf("u16_buf_len = %d\n", u16_buf_len); */ + +/* printf("STARTERS %s DONE ---------------\n", cl->name); */ +} + + +/* For sorting the blocks of unsigned chars. 
*/ +static size_t nu_val; + +int nu_memcmp(const void *a, const void *b) +{ + return memcmp(*(unsigned char**)a, *(unsigned char**)b, nu_val * sizeof(tbl_item)); +} + + +size_t newopt(tbl_item *ut, size_t usize, int shift, table_data *tbl) +{ + static int recurse = 0; + tbl_item *ti[RANGE]; /* table index */ + size_t numblocks; + size_t blocksize; + size_t uniq; + size_t i, j; + size_t smallest, t; + tbl_item *ii_save; + int uniqblock[1 << (8*sizeof(tbl_item) - 1)]; + tbl_item uit[RANGE]; + int shift2; + + if (shift > 15) { + return SIZE_MAX; + } + + ii_save = NULL; + blocksize = 1 << shift; + numblocks = usize >> shift; + + /* init table index */ + for (i=j=0 ; i < numblocks ; i++) { + ti[i] = ut + j; + j += blocksize; + } + + /* sort */ + nu_val = blocksize; + qsort(ti, numblocks, sizeof(unsigned char *), nu_memcmp); + + uniq = 1; + uit[(ti[0]-ut)/blocksize] = 0; + for (i=1 ; i < numblocks ; i++) { + if (memcmp(ti[i-1], ti[i], blocksize*sizeof(tbl_item)) < 0) { + if (++uniq > (1 << (8*sizeof(tbl_item) - 1))) { + break; + } + uniqblock[uniq - 1] = i; + } +#if 1 + else if (memcmp(ti[i-1], ti[i], blocksize*sizeof(tbl_item)) > 0) { + printf("bad sort %i!\n", i); + abort(); + } +#endif + uit[(ti[i]-ut)/blocksize] = uniq - 1; + } + + smallest = SIZE_MAX; + shift2 = -1; + if (uniq <= (1 << (8*sizeof(tbl_item) - 1))) { + smallest = numblocks + uniq * blocksize; + if (!recurse) { + ++recurse; + for (j=1 ; j < 14 ; j++) { + if ((numblocks >> j) < 2) break; + if (tbl) { + ii_save = tbl->ii; + tbl->ii = NULL; + } + if ((t = newopt(uit, numblocks, j, tbl)) < SIZE_MAX) { + t += uniq * blocksize; + } + if (tbl) { + tbl->ii = ii_save; + } + if (smallest >= t) { + shift2 = j; + smallest = t; +/* if (!tbl->ii) { */ +/* printf("ishift %u tshift %u size %u\n", */ +/* shift2, shift, t); */ +/* } */ +/* } else { */ +/* break; */ + } + } + --recurse; + } + } else { + return SIZE_MAX; + } + + if (tbl->ii) { + if (recurse) { + tbl->ii_shift = shift; + tbl->ii_len = numblocks; + 
memcpy(tbl->ii, uit, numblocks*sizeof(tbl_item)); + tbl->ti = tbl->ii + tbl->ii_len; + tbl->ti_len = uniq * blocksize; + for (i=0 ; i < uniq ; i++) { + memcpy(tbl->ti + i * blocksize, ti[uniqblock[i]], blocksize*sizeof(tbl_item)); + } + } else { + ++recurse; +/* printf("setting ishift %u tshift %u\n", shift2, shift); */ + newopt(uit, numblocks, shift2, tbl); + --recurse; + tbl->ti_shift = shift; + tbl->ut_len = uniq * blocksize; + tbl->ut = tbl->ti + tbl->ti_len; + for (i=0 ; i < uniq ; i++) { + memcpy(tbl->ut + i * blocksize, ti[uniqblock[i]], blocksize*sizeof(tbl_item)); + } + } + } + return smallest; +} + +static const int rule2val[8] = { + -1, + (1 << 14), /* forward */ + (2 << 14), /* position */ + (3 << 14), /* forward,position */ + 0, /* backward */ + -1, + -1, + -1, +}; + + +static int final_index_val_x(const char *s, const char *sym) +{ + int r; + + if (!(r = final_index_val0(s))) { + if (!strcmp(s, "IGNORE")) { + r = 0; + } else if (!strcmp(s, "..") || !strcmp(sym, "RANGE")) { + if (*sym == '.') { + final_index_val(sym); /* make sure it's known */ + } + r = 0x3fff; + } else if (!strcmp(s, ".")) { + r = 0x3ffe; + } else { + error_msg("can't find final index: %s", s); + } + } + return r; +} + +/* store rule2val in 2 high bits and collation index in lower. + * for sort strings, store (offset from base) + max colindex as index. 
+ */ +static unsigned int add_rule(weighted_item_t *wi) +{ + weight_t *w = wi->weight; + int i, j, r, n; + uint16_t rbuf[MAX_COLLATION_WEIGHTS]; + uint16_t ws_buf[32]; + void *mm; + char buf[32]; + const char *s; + const char *e; + + for (i=0 ; i < MAX_COLLATION_WEIGHTS ; i++) { + rbuf[i] = rule2val[R_FORWARD]; /* set a default to forward-ignore */ + } + + if (base_locale_array[base_locale_len].num_weights < w->num_weights) { + base_locale_array[base_locale_len].num_weights = w->num_weights; + } + + for (i=0 ; i < w->num_weights ; i++) { + assert(rule2val[(int)(w->rule[i])] >= 0); + assert(w->colitem[i] && *w->colitem[i]); + if (*w->colitem[i] == '"') { /* string... */ + s = w->colitem[i] + 1; + assert(*s == '<'); + n = 0; + do { + e = s; + do { + if (*e == '/') { + e += 2; + continue; + } + } while (*e++ != '>'); + assert(((size_t)(e-s) < sizeof(buf))); + memcpy(buf, s, (size_t)(e-s)); + buf[(size_t)(e-s)] = 0; + + r = final_index_val_x(buf, wi->symbol); + assert(n + 1 < sizeof(ws_buf)/sizeof(ws_buf[0])); + ws_buf[n++] = r | rule2val[(int)(w->rule[i])]; + + s = e; + } while (*s != '"'); + ws_buf[n++] = 0; /* terminator */ + + mm = memmem(weightstr_buffer, weightstr_len*sizeof(weightstr_buffer[0]), + ws_buf, n*sizeof(ws_buf[0])); + + if (!mm) { + assert(weightstr_len + n < WEIGHTSTR_LEN); + memcpy(weightstr_buffer + weightstr_len, ws_buf, n*sizeof(ws_buf[0])); + mm = weightstr_buffer + weightstr_len; + weightstr_len += n; + } + r = (((uint16_t *)(mm)) - weightstr_buffer) + + base_locale_array[base_locale_len].max_col_index + 2; + assert(r < (1 << 14)); + rbuf[i] = r | rule2val[(int)(w->rule[i])]; + } else { /* item */ + r = final_index_val_x(w->colitem[i], wi->symbol); + rbuf[i] = r | rule2val[(int)(w->rule[i])]; + } + } + + for (i=0 ; i < ruletable_len ; i += MAX_COLLATION_WEIGHTS) { + if (!memcmp(ruletable_buffer + i, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]))) { + return i/MAX_COLLATION_WEIGHTS; + } + } + + memcpy(ruletable_buffer + ruletable_len, 
rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0])); + ruletable_len += MAX_COLLATION_WEIGHTS; + + return (ruletable_len / MAX_COLLATION_WEIGHTS)-1; +} + +static unsigned int add_range_rule(range_item_t *ri) +{ + weight_t *w = ri->weight; + int i, j, r, n; + uint16_t rbuf[MAX_COLLATION_WEIGHTS]; + uint16_t ws_buf[32]; + void *mm; + char buf[32]; + const char *s; + const char *e; + + for (i=0 ; i < MAX_COLLATION_WEIGHTS ; i++) { + rbuf[i] = rule2val[R_FORWARD]; /* set a default to forward-ignore */ + } + + if (base_locale_array[base_locale_len].num_weights < w->num_weights) { + base_locale_array[base_locale_len].num_weights = w->num_weights; + } + + for (i=0 ; i < w->num_weights ; i++) { + assert(rule2val[(int)(w->rule[i])] >= 0); + assert(w->colitem[i] && *w->colitem[i]); + if (*w->colitem[i] == '"') { /* string... */ + s = w->colitem[i] + 1; + assert(*s == '<'); + n = 0; + do { + e = s; + do { + if (*e == '/') { + e += 2; + continue; + } + } while (*e++ != '>'); + assert(((size_t)(e-s) < sizeof(buf))); + memcpy(buf, s, (size_t)(e-s)); + buf[(size_t)(e-s)] = 0; + + r = final_index_val_x(buf, "RANGE"); + assert(n + 1 < sizeof(ws_buf)/sizeof(ws_buf[0])); + ws_buf[n++] = r | rule2val[(int)(w->rule[i])]; + + s = e; + } while (*s != '"'); + ws_buf[n++] = 0; /* terminator */ + + mm = memmem(weightstr_buffer, weightstr_len*sizeof(weightstr_buffer[0]), + ws_buf, n*sizeof(ws_buf[0])); + + if (!mm) { + assert(weightstr_len + n < WEIGHTSTR_LEN); + memcpy(weightstr_buffer + weightstr_len, ws_buf, n*sizeof(ws_buf[0])); + mm = weightstr_buffer + weightstr_len; + weightstr_len += n; + } + r = (((uint16_t *)(mm)) - weightstr_buffer) + + base_locale_array[base_locale_len].max_col_index + 2; + assert(r < (1 << 14)); + rbuf[i] = r | rule2val[(int)(w->rule[i])]; + } else { /* item */ + r = final_index_val_x(w->colitem[i], "RANGE"); + rbuf[i] = r | rule2val[(int)(w->rule[i])]; + } + } + + for (i=0 ; i < ruletable_len ; i += MAX_COLLATION_WEIGHTS) { + if (!memcmp(ruletable_buffer + 
i, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0]))) { + return i/MAX_COLLATION_WEIGHTS; + } + } + + memcpy(ruletable_buffer + ruletable_len, rbuf, MAX_COLLATION_WEIGHTS*sizeof(ruletable_buffer[0])); + ruletable_len += MAX_COLLATION_WEIGHTS; + + return (ruletable_len / MAX_COLLATION_WEIGHTS)-1; +} + +#define DUMPn(X) fprintf(stderr, "%10d-%-.20s", base_locale_array[n]. ## X, #X); + +static void dump_base_locale(int n) +{ + assert(n < base_locale_len); + + fprintf(stderr, "Base Locale: %s\n", base_locale_array[n].name); + + DUMPn(num_weights); + + DUMPn(ii_shift); + DUMPn(ti_shift); + DUMPn(ii_len); + DUMPn(ti_len); + DUMPn(max_weight); + fprintf(stderr, "\n"); + DUMPn(num_col_base); + DUMPn(max_col_index); + DUMPn(undefined_idx); + DUMPn(range_low); + DUMPn(range_count); + fprintf(stderr, "\n"); + DUMPn(range_base_weight); + DUMPn(num_starters); + + fprintf(stderr, "\n"); + DUMPn(range_rule_offset); + DUMPn(wcs2colidt_offset); + DUMPn(index2weight_offset); + fprintf(stderr, "\n"); + DUMPn(index2ruleidx_offset); + DUMPn(multistart_offset); + fprintf(stderr, "\n"); +} + +#undef DUMPn +#define DUMPn(X) fprintf(stderr, "%10d-%s", der_locale_array[n]. 
## X, #X); + +static void dump_der_locale(int n) +{ + assert(n < der_locale_len); + + fprintf(stderr, "Derived Locale: %s (%.12s)", + der_locale_array[n].name, + base_locale_array[der_locale_array[n].base_idx].name); + + + DUMPn(base_idx); + + DUMPn(undefined_idx); + + DUMPn(overrides_offset); + DUMPn(multistart_offset); + + fprintf(stderr, "\n"); +} + + +static unsigned long collate_pos; + +static void dump_u16_array(FILE *fp, uint16_t *u, int len, const char *name) +{ + int i; + + fprintf(fp, "\t/* %8lu %s */\n", collate_pos, name); + for (i=0 ; i < len ; i++) { + if (!(i & 7)) { + fprintf(fp, "\n\t"); + } + fprintf(fp," %#06x,", (unsigned int)(u[i])); + } + fprintf(fp,"\n"); + collate_pos += len; +} + +#define OUT_U16C(X,N) fprintf(fp,"\t%10d, /* %8lu %s */\n", X, collate_pos++, N); + +static void dump_collate(FILE *fp) +{ + int n; + + fprintf(fp, "const uint16_t __locale_collate_tbl[] = {\n"); + + OUT_U16C(base_locale_len, "numbef of base locales"); + OUT_U16C(der_locale_len, "number of derived locales"); + OUT_U16C(MAX_COLLATION_WEIGHTS, "max collation weights"); + OUT_U16C(index2weight_len, "number of index2{weight|ruleidx} elements"); + OUT_U16C(weightstr_len, "number of weightstr elements"); + OUT_U16C(multistart_len, "number of multistart elements"); + OUT_U16C(override_len, "number of override elements"); + OUT_U16C(ruletable_len, "number of ruletable elements"); + +#undef DUMPn +#define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", base_locale_array[n]. 
## X, collate_pos++, #X); + for (n=0 ; n < base_locale_len ; n++) { + unsigned wcs2colidt_offset_low = base_locale_array[n].wcs2colidt_offset & 0xffffU; + unsigned wcs2colidt_offset_hi = base_locale_array[n].wcs2colidt_offset >> 16; + fprintf(fp, "\t/* Base Locale %2d: %s */\n", n, base_locale_array[n].name); + DUMPn(num_weights); + DUMPn(num_starters); + DUMPn(ii_shift); + DUMPn(ti_shift); + DUMPn(ii_len); + DUMPn(ti_len); + DUMPn(max_weight); + DUMPn(num_col_base); + DUMPn(max_col_index); + DUMPn(undefined_idx); + DUMPn(range_low); + DUMPn(range_count); + DUMPn(range_base_weight); + DUMPn(range_rule_offset); + DUMPn(index2weight_offset); + DUMPn(index2ruleidx_offset); + DUMPn(multistart_offset); +#undef DUMPn +#define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", X, collate_pos++, #X); + DUMPn(wcs2colidt_offset_low); + DUMPn(wcs2colidt_offset_hi); + } +#undef DUMPn + + + fprintf(fp, "#define COL_IDX_C %5d\n", 0); +#define DUMPn(X) fprintf(fp, "\t%10d, /* %8lu %s */\n", der_locale_array[n]. 
## X, collate_pos++, #X); + for (n=0 ; n < der_locale_len ; n++) { + fprintf(fp, "#define COL_IDX_%s %5d\n", der_locale_array[n].name, n+1); + fprintf(fp, "\t/* Derived Locale %4d: %s (%.12s) */\n", + n, der_locale_array[n].name, + base_locale_array[der_locale_array[n].base_idx].name); + DUMPn(base_idx); + DUMPn(undefined_idx); + DUMPn(overrides_offset); + DUMPn(multistart_offset); + } +#undef DUMPn + + fprintf(fp, "\n"); + + dump_u16_array(fp, index2weight_buffer, index2weight_len, "index2weight"); + dump_u16_array(fp, index2ruleidx_buffer, index2ruleidx_len, "index2ruleidx"); + dump_u16_array(fp, multistart_buffer, multistart_len, "multistart"); + dump_u16_array(fp, override_buffer, override_len, "override"); + dump_u16_array(fp, ruletable_buffer, ruletable_len, "ruletable"); + dump_u16_array(fp, weightstr_buffer, weightstr_len, "weightstr"); + dump_u16_array(fp, wcs2colidt_buffer, wcs2colidt_len, "wcs2colidt"); + + + fprintf(fp,"}; /* %8lu */\n", collate_pos); + + fprintf(fp,"#define __lc_collate_data_LEN %d\n\n", collate_pos); +} -- cgit v1.2.3