Geonames 500 example trie, build status, trie graph
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
Armin Friedl 2020-10-11 20:51:42 +02:00
parent c6fad6471f
commit d8d8436325
5 changed files with 93 additions and 0 deletions

View file

@ -1,6 +1,10 @@
[![Build Status](https://drone.friedl.net/api/badges/incubator/bytetrie/status.svg)](https://drone.friedl.net/incubator/bytetrie)
# Bytetrie
A fast, dependency-free, self-compressing trie with radix 256 in pure Python.
![](trie.png)
Bytetrie allows fast prefix search in a large corpus of keys. Each key can be
associated with arbitrary data. It features fast lookup times at the cost of
expensive insertion. A Bytetrie is best used if it can be pre-filled with data.

BIN
examples/cities500.tar.gz Normal file

Binary file not shown.

7
examples/gen.sh Executable file
View file

@ -0,0 +1,7 @@
#!/bin/sh
# Regenerate the GeoNames example graph: unpack the data set, let
# geonames.py write the DOT file and render that file with twopi.
# All paths are relative, so run this from the directory that holds
# cities500.tar.gz and geonames.py.

# Abort at the first failing step instead of carrying on with missing or
# stale input files.
set -e

# Unpack the archive; the next two steps expect this to produce
# cities500/cities500.txt.
xwim cities500.tar.gz
mv cities500/cities500.txt .
rmdir cities500

# Writes geo_dot.dot into the current directory.
python geonames.py

# Radial layout around the node named "root".
twopi -Tpng geo_dot.dot -o geo_dot_twopi.png -Groot=root -x

82
examples/geonames.py Normal file
View file

@ -0,0 +1,82 @@
import csv
from bytetrie import ByteTrie
def load_geonames():
    """Build a multi-value ByteTrie from the ``cities500.txt`` data file.

    Reads the tab-separated file ``cities500.txt`` from the current working
    directory. For every row, column 1 is UTF-8 encoded and inserted as the
    key, with column 17 as the associated value. (In the GeoNames dump
    layout these are presumably the place name and the time zone -- confirm
    against the GeoNames readme.)

    Returns:
        The populated ``ByteTrie`` (created with ``multi_value=True``).

    Raises:
        Exception: Any error raised while inserting a row is re-raised
            unchanged after the offending row has been reported on stdout.
    """
    t = ByteTrie(multi_value=True)
    # encoding="utf-8": do not depend on the platform's locale encoding.
    # newline="": let the csv module do its own line-ending handling, as
    # its documentation recommends.
    with open("cities500.txt", "r", encoding="utf-8", newline="") as f:
        # QUOTE_NONE: the input is plain tab-separated text without any
        # quoting convention. With the default dialect a '"' at the start of
        # a field would open a quoted section and could swallow the
        # following tabs and line breaks, silently merging columns or rows.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for i, row in enumerate(reader):
            try:
                t.insert(row[1].encode("utf-8"), row[17])
            except Exception:
                print(f"Error in row {i}")
                # A short row has no label column; skip the details in that
                # case so the handler does not fail on its own.
                if len(row) > 1:
                    print(f"Label: '{row[1]}'")
                    print(f"Type: {type(row[1])}")
                raise
    return t
def insert(trie):
    """Create a convenience inserter bound to ``trie``.

    The returned callable accepts any number of strings (and only strings)
    and stores each one in ``trie`` with its UTF-8 encoding as the key and
    the string itself as the value. It returns ``None``.
    """
    def _insert(*vals):
        for text in vals:
            key = text.encode('utf-8')
            trie.insert(key, text)

    return _insert
def load_simple_trie():
    """Return a small ByteTrie pre-filled with a few overlapping keys.

    Every key is stored through the ``insert`` helper, i.e. under its UTF-8
    bytes with the string itself as the value.
    """
    trie = ByteTrie()
    add = insert(trie)
    # Insertion order is kept exactly: A, AA, AB, ABCDE, AACDEF, AACDEGG,
    # AACDEH.
    add("A", "AA", "AB", "ABCDE", "AACDEF", "AACDEGG", "AACDEH")
    return trie
# This uses internal representations which are not supposed to
# be used as public API and are subject to change!
from bytetrie.bytetrie import ByteTrie, Node, Root, Child, Terminal
def geonames_to_dot(t: ByteTrie):
    """Render the trie ``t`` as Graphviz DOT source.

    The output is a ``strict digraph`` with a fixed header (transparent
    background, thin arrowless edges, unlabeled nodes, and a node named
    ``root``) followed by the edge and node statements produced by
    ``_geonames_node_to_dot`` for every subtree below ``t.root``.

    Each direct child of the root gets its own hue: hues start at 0 and
    advance by ``1 / len(t.root.children)`` per child, and the whole subtree
    of a child inherits that child's hue.

    Args:
        t: The trie to render. Only ``t.root`` and the nodes reachable
            through ``children`` are read.

    Returns:
        The DOT source as a single string. For a trie whose root has no
        children this is just the header and the closing brace.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = ["""strict digraph {
graph [
bgcolor="transparent"
];
edge [
arrowhead="none",
penwidth="0.05",
];
node [
label="",
sep="2"
];
root [shape="circle", width="0.4"]
"""]
    children = t.root.children
    child_count = len(children)
    # An empty trie has no children; avoid dividing by zero and emit a
    # graph that contains only the root node.
    hue_inc = 1 / child_count if child_count else 0
    hue = 0
    for child in children:
        parts.append(_geonames_node_to_dot(t.root, child, 1, hue))
        hue += hue_inc
    parts.append("}")
    return "".join(parts)
def _geonames_node_to_dot(p: Node, n: Child, depth, hue):
    """Return the DOT statements for ``n`` and everything below it.

    Emits one edge statement from ``p`` to ``n`` and one node statement for
    ``n``, then recurses into ``n.children`` with ``depth + 1`` and the same
    ``hue``. Both statements carry a ``color`` of the form
    ``"<hue>,<depth*0.1>,<constant>"`` (constant 85 for the edge, 50 for the
    node).

    Args:
        p: Parent of ``n``; only its ``dot_id()`` is used.
        n: Node to render; its ``dot_id()`` and ``children`` are used.
        depth: Distance of ``n`` from the root, starting at 1 for the
            root's direct children.
        hue: First component of the color, shared by the whole subtree.

    Returns:
        The concatenated, newline-terminated DOT statements.
    """
    statements = [
        f'{p.dot_id()} -> {n.dot_id()} [color="{hue},{depth*0.1},85"]\n',
        f'{n.dot_id()} [color="{hue},{depth*0.1},50", shape="circle", width="0.1"]\n',
    ]
    # Depth-first, in the order the children are stored.
    statements.extend(
        _geonames_node_to_dot(n, sub, depth + 1, hue) for sub in n.children
    )
    return "".join(statements)
if __name__ == "__main__":
    # Build the complete DOT source first and only then open the output
    # file, so a failure while loading or rendering leaves an existing
    # geo_dot.dot untouched.
    dot_source = geonames_to_dot(load_geonames())
    with open("geo_dot.dot", "w") as out:
        out.write(dot_source)

BIN
trie.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 MiB