diff -urw GeoIP-1.4.4/libGeoIP/GeoIP.c local/libGeoIP/GeoIP.c --- GeoIP-1.4.4/libGeoIP/GeoIP.c 2007-11-24 22:27:25.000000000 -0500 +++ local/libGeoIP/GeoIP.c 2008-03-24 15:38:06.904721103 -0400 @@ -40,6 +40,8 @@ #include <sys/types.h> /* for fstat */ #include <sys/stat.h> /* for fstat */ +#include <sys/time.h> /* for gettimeofday() */ + #ifdef HAVE_STDINT_H #include <stdint.h> /* For uint32_t */ #endif @@ -321,7 +323,18 @@ static int _check_mtime(GeoIP *gi) { struct stat buf; + struct timeval t; + if (gi->flags & GEOIP_CHECK_CACHE) { + + /* stat only has second granularity, so don't + call it more than once a second */ + gettimeofday (&t,NULL); + if (t.tv_sec == gi->last_mtime_check){ + return 0; + } + gi->last_mtime_check = t.tv_sec; + if (stat(gi->file_path, &buf) != -1) { if (buf.st_mtime != gi->mtime) { /* GeoIP Database file updated */ @@ -386,17 +399,236 @@ return 0; } + +/* Free every node slab and then the index itself; does nothing when no index exists. */ +static void _geoip_destroy_superindex (GeoIP *gi) +{ + int i; + + if (!gi->superindex) return; + + for (i=0; i< 4096; i++) + free (gi->superindex->allocated[i]); + + free (gi->superindex); + gi->superindex = NULL; + return; +} + +/* Map a 24-bit node address to its node: the high 12 bits pick the slab, the low 12 bits the slot. */ +static struct geoip_super_index_node * +super_index_ptr_from_addr (struct superindex_hash *si, uint32_t addr) +{ + addr = addr & 0xffffff; + return si->allocated[addr >> 12] + (addr & 0xfff); +} + +static unsigned int _geoip_superindex_seek_record (GeoIP *gi, unsigned long ipnum) +{ + unsigned long ptrval; + struct geoip_super_index_node *node; + int i; + uint32_t sinval, blockval; + + ptrval = (unsigned long) gi->superindex->direct_hash[ipnum >> 15]; + + if (ptrval == 0) + { + return gi->databaseSegments[0]; + } + else if (ptrval & 1) + { + // direct hit with just 1 access! 
+ gi->netmask = ((ptrval >> 1) & 0x1F); + return ptrval >> 6; + } + + node = (struct geoip_super_index_node *)ptrval; + + for (i =18; /*while (1) */ ; i += 3) + { + sinval = node->block[(ipnum >> (30 - i)) & 0x7]; + blockval = sinval & 0xffffff; + + if (blockval == 0) + return gi->databaseSegments[0]; + + /* if flag is set, then blockval is really a pointer to + a new super index node.. otherwise, it is our return value */ + + if ((sinval & (1 << 24)) == 0) + return blockval; + + node = super_index_ptr_from_addr (gi->superindex, blockval); + } + + return 0; +} +/* Store ptrval in every direct_hash slot covered by a prefix of length depth (17 bits or fewer). */ +static void +_update_superindex_hash (GeoIP *gi, int depth, unsigned int netmask, unsigned long ptrval) +{ + if (depth == 17) + { + gi->superindex->direct_hash[netmask >> 15] = (void *)ptrval; + } + else + { /* recursively split a 16 into two 17s or a 15 to two 16s etc */ + _update_superindex_hash (gi, depth+1, netmask, ptrval); + _update_superindex_hash (gi, depth+1, netmask | (1U << (31-depth)), ptrval); /* 1U: a signed 1 shifted by 31 (depth 0) is undefined */ + } + return; +} + +/* Hand out the next node, allocating a zeroed slab of 4096 nodes whenever a new slab starts; NULL once 2^24 nodes exist. */ +static struct geoip_super_index_node * +allocate_super_index_node (struct superindex_hash *si, uint32_t *newaddr) +{ + uint32_t addr = si->count++; + + if (addr > 0xffffff) + return NULL; + if (newaddr) + *newaddr = addr; + /* NOTE(review): the posix_memalign result below is not checked; on failure the memset would write through a slab pointer that was never set */ + if ((addr & 0xfff) == 0) + { + posix_memalign ((void **)&(si->allocated[addr >> 12]), + 128, sizeof (struct geoip_super_index_node) * 4096); + memset (si->allocated[addr >> 12], 0, + sizeof (struct geoip_super_index_node) * 4096); + } + return si->allocated[addr >> 12] + (addr & 0xfff); +} + +/* Return the child node behind cur->block[idx], creating and linking a new one when the slot is empty. */ +static struct geoip_super_index_node * +link_leaf_node(struct superindex_hash *si, struct geoip_super_index_node *cur, int idx) +{ + if (cur->block[idx] == 0) + { + uint32_t newaddr; + struct geoip_super_index_node *tnode; + + tnode = allocate_super_index_node (si, &newaddr); + cur->block[idx] = newaddr | (1 << 24); + return tnode; + } + + return super_index_ptr_from_addr (si, cur->block[idx]); +} + + + +static void +_make_super_index (GeoIP *gi, int offset, int depth, 
unsigned netmask) +{ + const unsigned char *buf; + + /* + netmask <= accesses cumulative % of db + 17 1 0.34 + 20 2 2.01 + 23 3 12.00 + 26 4 31.31 + 29 5 88.22 + 32 6 100.0 + + median is 28 - so that is 5 accesses, compared to 28 with radix tree. almost 60% + of database served in 5 accesses... any given valid address is 4.7 accesses. + */ + if (offset >= gi->databaseSegments[0]) + { + if (offset > gi->databaseSegments[0]) + { + if (depth <= 17) + { + unsigned long ptrval; + + ptrval = offset << 6; + ptrval |= depth << 1; /* depth is 1-17 - so 5 bits is fine */ + ptrval |= 1; /* indicates this isn't an index node ptr */ + + _update_superindex_hash (gi, depth, netmask, ptrval); + } + else + { + struct geoip_super_index_node *node; + int i; + unsigned int hash; + + hash = netmask >> 15; + node = gi->superindex->direct_hash[hash]; + if (node == NULL) + node = gi->superindex->direct_hash[hash] = + allocate_super_index_node(gi->superindex, NULL); + + for (i=18; i <= depth; i += 3) + { + if (depth >= (i+3)) + { + /* just need to traverse to the next node */ + node = link_leaf_node(gi->superindex, node, + (netmask >> (30 - i)) & 0x7); + } + else if (depth == (i+2)) + { + /* just need to fill in one node */ + node->block[(netmask >> (30 - i)) & 0x7] = offset; + } + else + { + /* just decompose into two more nodes and a longer netmask */ + _make_super_index (gi, offset, depth + 1, netmask); + _make_super_index (gi, offset, depth + 1, netmask | ( 1U << (31-depth))); + } + } + } + } + return; + } + + buf = gi->cache + (long)gi->record_length * 2 * offset; + + /* Take the left-hand branch */ + offset = (buf[3*0 + 0] << (0*8)) + + (buf[3*0 + 1] << (1*8)) + + (buf[3*0 + 2] << (2*8)); + _make_super_index (gi, offset, depth + 1, netmask); + + /* Take the right-hand branch; 1U because the root call has depth 0 and a signed 1 shifted by 31 is undefined */ + offset = (buf[3*1 + 0] << (0*8)) + + (buf[3*1 + 1] << (1*8)) + + (buf[3*1 + 2] << (2*8)); + _make_super_index (gi, offset, depth + 1, netmask | ( 1U << (31-depth))); + + return; +} + +static +void 
*_geoip_make_super_index (GeoIP *gi) +{ + if (!gi) return NULL; + gi->superindex = NULL; /* seek and destroy test this pointer even when no index gets built */ + if ((!gi->cache) || gi->index_cache || gi->record_length != 3) return NULL; /* only for full mem caching; the builder decodes 3-byte records only */ + if (!(gi->superindex = calloc (1, sizeof (struct superindex_hash)))) return NULL; /* out of memory: lookups fall back to the tree walk */ + _make_super_index (gi, 0, 0, 0); + return gi->superindex; +} + + unsigned int _GeoIP_seek_record (GeoIP *gi, unsigned long ipnum) { int depth; unsigned int x; unsigned char stack_buffer[2 * MAX_RECORD_LENGTH]; const unsigned char *buf = (gi->cache == NULL) ? stack_buffer : NULL; unsigned int offset = 0; - const unsigned char * p; int j; _check_mtime(gi); + if (gi->superindex) return _geoip_superindex_seek_record (gi, ipnum); + for (depth = 31; depth >= 0; depth--) { if (gi->cache == NULL && gi->index_cache == NULL) { /* read from disk */ @@ -584,6 +816,7 @@ return NULL; } gi->mtime = buf.st_mtime; + gi->last_mtime_check = gi->mtime; } gi->cache = NULL; } @@ -606,6 +839,8 @@ } else { gi->index_cache = NULL; } + + _geoip_make_super_index (gi); /* nop if full mem cache not on */ return gi; } } @@ -629,6 +864,9 @@ free(gi->file_path); if (gi->databaseSegments != NULL) free(gi->databaseSegments); + + _geoip_destroy_superindex (gi); + free(gi); } diff -urw GeoIP-1.4.4/libGeoIP/GeoIPCity.c local/libGeoIP/GeoIPCity.c --- GeoIP-1.4.4/libGeoIP/GeoIPCity.c 2007-07-23 23:48:50.000000000 -0400 +++ local/libGeoIP/GeoIPCity.c 2008-03-21 17:38:35.517690063 -0400 @@ -36,22 +36,28 @@ static const int FULL_RECORD_LENGTH = 50; +static int _iso_8859_1__utf8(const char * iso, char *dest); + static -GeoIPRecord * _extract_record(GeoIP* gi, unsigned int seek_record, int *next_record_ptr) { +GeoIPRecord * _extract_record(GeoIP* gi, unsigned int seek_record, int *next_record_ptr, GeoIPRecord *dst) { int record_pointer; - unsigned char *record_buf = NULL; + unsigned char *record_buf; unsigned char *begin_record_buf = NULL; GeoIPRecord * record; - int str_length = 0; int j; double latitude = 0, longitude = 0; - 
int dmaarea_combo = 0; - int bytes_read = 0; + int dmaarea_combo ; + int bytes_read; + char *dataspace; + int country_code; + div_t d; + if (seek_record == gi->databaseSegments[0]) return NULL; - record = malloc(sizeof(GeoIPRecord)); - memset(record, 0, sizeof(GeoIPRecord)); + record = dst ? dst : malloc(sizeof(GeoIPRecord)); if (record == NULL) return NULL; /* allocation failed */ + record->record_allocated_at = dst; + dataspace = record->dataspace; record->charset = gi->charset; @@ -72,44 +78,41 @@ } /* get country */ - record->continent_code = (char *) GeoIP_country_continent[record_buf[0]]; - record->country_code = (char *) GeoIP_country_code [record_buf[0]]; - record->country_code3 = (char *) GeoIP_country_code3[record_buf[0]]; - record->country_name = (char *) GeoIP_country_name [record_buf[0]]; + country_code = record_buf[0]; + record->continent_code = (char *) GeoIP_country_continent[country_code]; + record->country_code = (char *) GeoIP_country_code [country_code]; + record->country_code3 = (char *) GeoIP_country_code3[country_code]; + record->country_name = (char *) GeoIP_country_name [country_code]; record_buf++; /* get region */ - while (record_buf[str_length] != '\0') - str_length++; - if (str_length > 0) { - record->region = malloc(str_length+1); - strncpy(record->region, record_buf, str_length+1); - } - record_buf += str_length + 1; - str_length = 0; + for (record->region = dataspace; (*dataspace = *record_buf); dataspace++, record_buf++); + dataspace++; + record_buf++; + if (*(record->region) == 0) + record->region = NULL; /* get city */ - while (record_buf[str_length] != '\0') - str_length++; - if (str_length > 0) { - if ( gi->charset == GEOIP_CHARSET_UTF8 ) { - record->city = _iso_8859_1__utf8( (const char * ) record_buf ); - } else { - record->city = malloc(str_length+1); - strncpy(record->city, ( const char * ) record_buf, str_length+1); - } + if (gi->charset == GEOIP_CHARSET_UTF8) + { + record->city = dataspace; + dataspace += _iso_8859_1__utf8( (const char * ) record_buf , dataspace); record_buf += strlen((const char *) record_buf) + 1; /* the converter gets its input pointer by value, so step over the source city here */ + } + else + { 
+ for (record->city = dataspace;(*dataspace = *record_buf); dataspace++, record_buf++); + dataspace++; + record_buf++; } - record_buf += (str_length + 1); - str_length = 0; + if (*(record->city) == 0) + record->city = NULL; /* get postal code */ - while (record_buf[str_length] != '\0') - str_length++; - if (str_length > 0) { - record->postal_code = malloc(str_length+1); - strncpy(record->postal_code, record_buf, str_length+1); - } - record_buf += (str_length + 1); + for (record->postal_code = dataspace; (*dataspace = *record_buf); dataspace++, record_buf++); + dataspace++; + record_buf++; + if (*(record->postal_code) == 0) + record->postal_code = NULL; /* get latitude */ for (j = 0; j < 3; ++j) @@ -123,14 +126,20 @@ record->longitude = longitude/10000 - 180; /* get area code and dma code for post April 2002 databases and for US locations */ - if (GEOIP_CITY_EDITION_REV1 == gi->databaseType) { - if (!strcmp(record->country_code, "US")) { + if ( (country_code == 225) && /* presumably the table index of "US", replacing the strcmp - TODO confirm */ + (GEOIP_CITY_EDITION_REV1 == gi->databaseType)) + { record_buf += 3; + dmaarea_combo = 0; for (j = 0; j < 3; ++j) dmaarea_combo += (record_buf[j] << (j * 8)); - record->dma_code = dmaarea_combo/1000; - record->area_code = dmaarea_combo % 1000; - } + d = div (dmaarea_combo, 1000); + record->dma_code = d.quot; + record->area_code = d.rem; + } + else + { + record->dma_code = record->area_code = 0; } if (gi->cache == NULL) @@ -144,7 +153,7 @@ } static -GeoIPRecord * _get_record(GeoIP* gi, unsigned long ipnum) { +GeoIPRecord * _get_record(GeoIP* gi, unsigned long ipnum, GeoIPRecord *dst) { unsigned int seek_record; if (gi->databaseType != GEOIP_CITY_EDITION_REV0 && @@ -154,11 +163,11 @@ } seek_record = _GeoIP_seek_record(gi, ipnum); - return _extract_record(gi, seek_record, NULL); + return _extract_record(gi, seek_record, NULL, dst); } GeoIPRecord * GeoIP_record_by_ipnum (GeoIP* gi, unsigned long ipnum) { - return _get_record(gi, ipnum); + return _get_record(gi, ipnum, NULL); } GeoIPRecord * 
GeoIP_record_by_addr (GeoIP* gi, const char *addr) { @@ -167,7 +176,17 @@ return 0; } ipnum = _GeoIP_addr_to_num(addr); - return _get_record(gi, ipnum); + return _get_record(gi, ipnum, NULL); +} + +int GeoIP_record_by_addr_s (GeoIP* gi, const char *addr, GeoIPRecord *dst) { + unsigned long ipnum; + + if (addr == NULL || dst == NULL) /* without a dst the record would be heap-allocated and then lost */ + return -1; + + ipnum = _GeoIP_addr_to_num(addr); + return _get_record(gi, ipnum, dst) ? 0 : -1; } GeoIPRecord * GeoIP_record_by_name (GeoIP* gi, const char *name) { @@ -176,7 +195,7 @@ return 0; } ipnum = _GeoIP_lookupaddress(name); - return _get_record(gi, ipnum); + return _get_record(gi, ipnum, NULL); } int GeoIP_record_id_by_addr (GeoIP* gi, const char *addr) { @@ -202,30 +221,31 @@ printf("GeoIP_next_record not supported in memory cache mode\n"); return 1; } - *gir = _extract_record(gi, *record_iter, record_iter); + *gir = _extract_record(gi, *record_iter, record_iter, NULL); return 0; } void GeoIPRecord_delete (GeoIPRecord *gir) { - free(gir->region); - free(gir->city); - free(gir->postal_code); + if (gir && !gir->record_allocated_at) /* NULL is ignored; records filled through a caller-supplied dst are not freed */ free(gir); -} + return; +} -char * _iso_8859_1__utf8(const char * iso) { +static +int _iso_8859_1__utf8(const char * iso, char *dest) { char c, k; char * p; - char * t = iso; + char * t = (char *)iso; int len = 0; while ( ( c = *t++) ){ if ( c < 0 ) len++; } len += t - iso; - t = p = malloc( len ); + + t = p = dest; if ( p ){ while ( ( c = *iso++ ) ) { @@ -240,5 +260,5 @@ } *t++ = 0x00; } - return p; + return t-p; } diff -urw GeoIP-1.4.4/libGeoIP/GeoIPCity.h local/libGeoIP/GeoIPCity.h --- GeoIP-1.4.4/libGeoIP/GeoIPCity.h 2007-08-15 19:55:45.000000000 -0400 +++ local/libGeoIP/GeoIPCity.h 2008-03-14 12:22:53.654261013 -0400 @@ -40,12 +40,18 @@ int area_code; int charset; char *continent_code; + void *record_allocated_at; + char dataspace[128]; } GeoIPRecord; GeoIPRecord * GeoIP_record_by_ipnum (GeoIP* gi, unsigned long ipnum); GeoIPRecord * GeoIP_record_by_addr (GeoIP* gi, const char *addr); GeoIPRecord * 
GeoIP_record_by_name (GeoIP* gi, const char *host); +/* return 0 on success, -1 on failure */ +int GeoIP_record_by_addr_s (GeoIP* gi, const char *addr, GeoIPRecord *dst); + + int GeoIP_record_id_by_addr (GeoIP* gi, const char *addr); int GeoIP_init_record_iter (GeoIP* gi); /* returns 0 on success, 1 on failure */ @@ -53,8 +59,6 @@ void GeoIPRecord_delete (GeoIPRecord *gir); -/* NULL on failure otherwise a malloced string in utf8 */ -char * _iso_8859_1__utf8(const char *); #ifdef __cplusplus } Only in local/libGeoIP: GeoIP.c.orig Only in local/libGeoIP: GeoIP.c.rej diff -urw GeoIP-1.4.4/libGeoIP/GeoIP.h local/libGeoIP/GeoIP.h --- GeoIP-1.4.4/libGeoIP/GeoIP.h 2007-11-24 22:32:36.000000000 -0500 +++ local/libGeoIP/GeoIP.h 2008-03-24 15:36:53.464535977 -0400 @@ -37,14 +37,48 @@ #define MAX_RECORD_LENGTH 4 #define NUM_DB_TYPES 20 +#include <stdint.h> /* for uint32_t */ + + +struct geoip_super_index_node +{ + /* + this represents 3 levels of a netmask, the 3 bits of interest are + used to directly index into the block array. If less than 3 bits are available + then it needs to be expanded into all its possible incarnations + + the data in the block is either an offset to the db memory cache, or + it is a pointer to another super index node. In each case the value is 24 bits.. + (1<<24) is used as a flag to multiplex between pointer or offset. One of these + short pointers can be converted to a C pointer with super_index_ptr_from_addr() + */ + + uint32_t block[8]; +}; + +struct superindex_hash +{ + /* the direct hash is used to hash the first 17 bits of a netmask into either an offset to the db + or a pointer to a super index node. */ + void *direct_hash[131072]; /* 2^17 slots, one per 17-bit prefix */ + + /*the super_index_nodes work better if they are aligned on cachelines, but doing so for each one would + be a lot of overhead.. 
so we hand them out 4096 at a time */ + struct geoip_super_index_node *allocated[4096]; + uint32_t count; +}; + + typedef struct GeoIPTag { FILE *GeoIPDatabase; char *file_path; unsigned char *cache; unsigned char *index_cache; + struct superindex_hash *superindex; unsigned int *databaseSegments; char databaseType; time_t mtime; + time_t last_mtime_check; int flags; off_t size; char record_length; @@ -53,6 +87,16 @@ int netmask; /* netmask of last lookup - set using depth in _GeoIP_seek_record */ } GeoIP; + /* if superindex is not null, it is an array of 128K void *'s to direct hash + the first 17 bits of each address + + if (addr & 1) then it isn't a ptr, rather it is a block address hit - the + address had a netmask of /17 or shorter (about 0.34% of the database). In this case + the next 5 low bits are the length of the netmask, and the remaining bits are the block offset + .. with 32 bit ptrs, that's 26 bits - enough for 6.7M blocks. a 64 bit build can + support a lot more with the same on disk format + */ + typedef enum { GEOIP_CHARSET_ISO_8859_1 = 0,