Description
Symbol lookup currently uses linear search through linked lists, consuming 35% of compilation time for large programs. Implementing a hash table would reduce lookup complexity from O(n) to O(1) average case.
Current Implementation
// src/globals.c - Current linear search
symbol_t *find_symbol(char *name) {
for (symbol_t *sym = symbols; sym; sym = sym->next) {
if (strcmp(sym->name, name) == 0)
return sym;
}
return NULL;
}
Performance profile shows:
- 10,000 symbols: ~350ms lookup time (35% of total)
- 1,000 symbols: ~35ms lookup time (20% of total)
- 100 symbols: ~3ms lookup time (5% of total)
Proposed Hash Table Implementation
1. Data Structures
// Add to src/defs.h
#define SYMBOL_HASH_SIZE 1021 // Prime number for better distribution
typedef struct symbol_entry {
symbol_t *symbol;
struct symbol_entry *next; // Chain for collision resolution
} symbol_entry_t;
typedef struct {
symbol_entry_t *buckets[SYMBOL_HASH_SIZE];
int total_symbols;
int max_chain_length; // For statistics
// Statistics
int lookups;
int collisions;
int hits;
int misses;
} symbol_table_t;
// Global symbol tables
typedef struct {
symbol_table_t *global_symbols;
symbol_table_t *local_symbols; // Current function scope
symbol_table_t *types; // Type definitions
} symbol_context_t;
2. Hash Function
// src/symbol_table.c - New file
// DJB2 hash function - simple and effective
uint32_t hash_string(const char *str) {
uint32_t hash = 5381;
int c;
while ((c = *str++)) {
hash = ((hash << 5) + hash) + c; // hash * 33 + c
}
return hash;
}
int get_bucket_index(const char *name) {
return hash_string(name) % SYMBOL_HASH_SIZE;
}
3. Symbol Table Operations
symbol_table_t *create_symbol_table(void) {
symbol_table_t *table = calloc(1, sizeof(symbol_table_t));
return table;
}
void insert_symbol(symbol_table_t *table, symbol_t *symbol) {
int index = get_bucket_index(symbol->name);
// Create new entry
symbol_entry_t *entry = malloc(sizeof(symbol_entry_t));
entry->symbol = symbol;
// Insert at head of chain (O(1) insertion)
entry->next = table->buckets[index];
table->buckets[index] = entry;
// Update statistics
table->total_symbols++;
if (entry->next) {
table->collisions++;
}
// Track max chain length
int chain_len = 0;
for (symbol_entry_t *e = entry; e; e = e->next) {
chain_len++;
}
if (chain_len > table->max_chain_length) {
table->max_chain_length = chain_len;
}
}
symbol_t *lookup_symbol(symbol_table_t *table, const char *name) {
table->lookups++;
int index = get_bucket_index(name);
symbol_entry_t *entry = table->buckets[index];
// Search chain
while (entry) {
if (strcmp(entry->symbol->name, name) == 0) {
table->hits++;
return entry->symbol;
}
entry = entry->next;
}
table->misses++;
return NULL;
}
// Replace global find_symbol
symbol_t *find_symbol_fast(const char *name) {
symbol_t *sym = NULL;
// Check local scope first
if (current_context->local_symbols) {
sym = lookup_symbol(current_context->local_symbols, name);
if (sym) return sym;
}
// Check global scope
return lookup_symbol(current_context->global_symbols, name);
}
4. Dynamic Resizing (Optional Enhancement)
void resize_symbol_table(symbol_table_t *table) {
// Only resize if load factor > 0.75
float load_factor = (float)table->total_symbols / SYMBOL_HASH_SIZE;
if (load_factor < 0.75) return;
// Save old buckets
symbol_entry_t *old_buckets[SYMBOL_HASH_SIZE];
memcpy(old_buckets, table->buckets, sizeof(old_buckets));
// Clear table
memset(table->buckets, 0, sizeof(table->buckets));
table->total_symbols = 0;
table->collisions = 0;
// Rehash all symbols with new size
for (int i = 0; i < SYMBOL_HASH_SIZE; i++) {
symbol_entry_t *entry = old_buckets[i];
while (entry) {
symbol_entry_t *next = entry->next;
entry->next = NULL;
// Reinsert with new hash
int new_index = get_bucket_index(entry->symbol->name);
entry->next = table->buckets[new_index];
table->buckets[new_index] = entry;
table->total_symbols++;
entry = next;
}
}
}
5. Integration Points
// Update src/parser.c
void enter_scope(void) {
// Create new local symbol table
current_context->local_symbols = create_symbol_table();
}
void exit_scope(void) {
// Print statistics if verbose
if (verbose_mode) {
print_symbol_table_stats(current_context->local_symbols);
}
// Free local symbol table
free_symbol_table(current_context->local_symbols);
current_context->local_symbols = NULL;
}
// Update symbol creation
symbol_t *create_symbol(const char *name, type_t *type) {
symbol_t *sym = malloc(sizeof(symbol_t));
sym->name = strdup(name);
sym->type = type;
// Insert into appropriate table
if (current_scope == SCOPE_GLOBAL) {
insert_symbol(current_context->global_symbols, sym);
} else {
insert_symbol(current_context->local_symbols, sym);
}
return sym;
}
6. Performance Monitoring
void print_symbol_table_stats(symbol_table_t *table) {
printf("=== Symbol Table Statistics ===\n");
printf("Total symbols: %d\n", table->total_symbols);
printf("Hash buckets: %d\n", SYMBOL_HASH_SIZE);
printf("Load factor: %.2f\n",
(float)table->total_symbols / SYMBOL_HASH_SIZE);
printf("Collisions: %d (%.1f%%)\n",
table->collisions,
table->collisions * 100.0 / table->total_symbols);
printf("Max chain length: %d\n", table->max_chain_length);
printf("Total lookups: %d\n", table->lookups);
printf("Hit rate: %.1f%%\n",
table->hits * 100.0 / table->lookups);
// Distribution analysis
int empty = 0, chains = 0;
for (int i = 0; i < SYMBOL_HASH_SIZE; i++) {
if (!table->buckets[i]) {
empty++;
} else if (table->buckets[i]->next) {
chains++;
}
}
printf("Empty buckets: %d (%.1f%%)\n",
empty, empty * 100.0 / SYMBOL_HASH_SIZE);
printf("Chains: %d\n", chains);
}
Testing
Performance Benchmark
// tests/perf/symbol_lookup.c
#define NUM_SYMBOLS 10000
void benchmark_symbol_lookup() {
clock_t start, end;
// Create many symbols
for (int i = 0; i < NUM_SYMBOLS; i++) {
char name[32];
sprintf(name, "symbol_%d", i);
create_symbol(name, &type_int);
}
// Benchmark lookups
start = clock();
for (int i = 0; i < NUM_SYMBOLS * 10; i++) {
char name[32];
sprintf(name, "symbol_%d", rand() % NUM_SYMBOLS);
find_symbol_fast(name);
}
end = clock();
double time_spent = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("100,000 lookups in %.3f seconds\n", time_spent);
printf("Average: %.3f microseconds per lookup\n",
time_spent * 1000000 / (NUM_SYMBOLS * 10));
}
Correctness Tests
// Verify hash table behaves identically to linear search
void test_symbol_table_correctness() {
// Test insertion and lookup
symbol_t *s1 = create_symbol("test1", &type_int);
symbol_t *s2 = create_symbol("test2", &type_char);
assert(find_symbol_fast("test1") == s1);
assert(find_symbol_fast("test2") == s2);
assert(find_symbol_fast("nonexistent") == NULL);
// Test collision handling
// Force collisions by creating symbols with same hash
for (int i = 0; i < 100; i++) {
char name[32];
sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE);
create_symbol(name, &type_int);
}
// Verify all can be found
for (int i = 0; i < 100; i++) {
char name[32];
sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE);
assert(find_symbol_fast(name) != NULL);
}
}
Migration Plan
- Phase 1: Implement hash table alongside existing code
- Phase 2: Add compatibility layer with fallback
- Phase 3: Migrate all symbol lookups
- Phase 4: Remove old linear search code
Success Criteria
- O(1) average lookup time
- No functionality regression
- 30%+ speedup for large programs
- Hash collision rate < 10%
- All existing tests pass
Description
Symbol lookup currently uses linear search through linked lists, consuming 35% of compilation time for large programs. Implementing a hash table would reduce lookup complexity from O(n) to O(1) average case.
Current Implementation
Performance profile shows:
Proposed Hash Table Implementation
1. Data Structures
2. Hash Function
3. Symbol Table Operations
4. Dynamic Resizing (Optional Enhancement)
5. Integration Points
6. Performance Monitoring
Testing
Performance Benchmark
Correctness Tests
Migration Plan
Success Criteria