Geonames 500 example trie, build status, trie graph
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
Armin Friedl 2020-10-11 20:51:42 +02:00
parent c6fad6471f
commit d8d8436325
5 changed files with 93 additions and 0 deletions

View file

@ -1,6 +1,10 @@
[![Build Status](https://drone.friedl.net/api/badges/incubator/bytetrie/status.svg)](https://drone.friedl.net/incubator/bytetrie)
# Bytetrie
A fast, dependency-free, self-compressing trie with radix 256 in pure Python.
![](trie.png)
Bytetrie allows fast prefix search in a large corpus of keys. Each key can be
associated with arbitrary data. It features fast lookup times at the cost of
expensive insertion. A Bytetrie is best used if it can be pre-filled with data.

BIN
examples/cities500.tar.gz Normal file

Binary file not shown.

7
examples/gen.sh Executable file
View file

@ -0,0 +1,7 @@
#!/bin/sh
# Regenerate the GeoNames example graph: unpack the data set, let geonames.py
# write the DOT description of the trie, then render it to a PNG with twopi.

# Stop at the first failing step so later steps never run on missing or
# stale input (e.g. rendering an old geo_dot.dot after a failed Python run).
set -e

# Unpack the archive. The mv below expects this to produce
# cities500/cities500.txt (xwim's exact behaviour is not shown here -- confirm).
xwim cities500.tar.gz
mv cities500/cities500.txt .

# Cleanup of the now-empty directory is best-effort: a leftover directory
# must not prevent the graph from being generated.
rmdir cities500 || true

# geonames.py reads cities500.txt and writes geo_dot.dot.
python geonames.py

# Radial layout centred on the node named "root" that geonames.py emits.
twopi -Tpng geo_dot.dot -o geo_dot_twopi.png -Groot=root -x

82
examples/geonames.py Normal file
View file

@ -0,0 +1,82 @@
import csv
from bytetrie import ByteTrie
def load_geonames():
    """Build a multi-value ``ByteTrie`` from the tab-separated ``cities500.txt``.

    For every row, column 1 (UTF-8 encoded) is inserted as the key and
    column 17 as the associated value.  In the GeoNames dump layout these
    are presumably the place name and its timezone -- confirm against the
    GeoNames readme.

    Returns:
        The populated ``ByteTrie``.

    Raises:
        Exception: whatever the row access or ``ByteTrie.insert`` raised;
            the offending row is reported on stdout before re-raising.
    """
    t = ByteTrie(multi_value=True)
    # Explicit UTF-8 so decoding does not depend on the platform default;
    # newline="" is what the csv module asks for when reading files.
    with open("cities500.txt", "r", encoding="utf-8", newline="") as f:
        # The file is plain tab-separated text without quoting.  QUOTE_NONE
        # stops a literal '"' inside a field from being treated as a quote,
        # which could otherwise merge fields or even rows.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for i, row in enumerate(reader):
            try:
                t.insert(row[1].encode("utf-8"), row[17])
            except Exception:
                # Rows that are too short must not trigger a second
                # IndexError here and obscure the original error.
                label = row[1] if len(row) > 1 else None
                print(f"Error in row {i}")
                print(f"Label: '{label}'")
                print(f"Type: {type(label)}")
                raise
    return t
def insert(trie):
    """Return a bulk-insert helper bound to ``trie``.

    The returned callable takes any number of strings and inserts each one
    into ``trie`` with its UTF-8 encoding as the key and the string itself
    as the value.  Only strings are supported, because every argument is
    passed through ``str.encode``.
    """
    def _insert(*vals):
        for text in vals:
            key = text.encode('utf-8')
            trie.insert(key, text)

    return _insert
def load_simple_trie():
    """Return a small ``ByteTrie`` pre-filled with a fixed set of demo keys.

    Each key is inserted through the ``insert`` helper, i.e. stored under
    its UTF-8 encoding with the string itself as value.  The keys are added
    in the order listed below, one helper call per group.
    """
    trie = ByteTrie()
    add = insert(trie)
    key_groups = (
        ("A",),
        ("AA", "AB"),
        ("ABCDE",),
        ("AACDEF", "AACDEGG", "AACDEH"),
    )
    for group in key_groups:
        add(*group)
    return trie
# This uses internal representations which are not supposed to
# be used as public API and are subject to change!
from bytetrie.bytetrie import ByteTrie, Node, Root, Child, Terminal
def geonames_to_dot(t: ByteTrie):
    """Render the trie ``t`` as a Graphviz DOT document.

    Every direct child of the trie root starts its own subtree, and each
    subtree gets its own hue: the hue range 0..1 is divided evenly by the
    number of root children.  The subtrees themselves are rendered by
    ``_geonames_node_to_dot``.

    An empty trie (root without children) produces a graph containing only
    the ``root`` node instead of failing with ``ZeroDivisionError``.

    Args:
        t: the trie to render; only ``t.root`` and ``t.root.children`` are
            accessed here.

    Returns:
        The complete DOT source as a single string.
    """
    # Collect fragments and join once rather than growing a string with +=.
    parts = ["""strict digraph {
graph [
bgcolor="transparent"
];
edge [
arrowhead="none",
penwidth="0.05",
];
node [
label="",
sep="2"
];
root [shape="circle", width="0.4"]
"""]
    children = t.root.children
    count = len(children)
    if count:  # guard: 1/0 for a trie without any keys
        hue_inc = 1 / count
        hue = 0
        for child in children:
            parts.append(_geonames_node_to_dot(t.root, child, 1, hue))
            hue += hue_inc
    parts.append("}")
    return "".join(parts)
def _geonames_node_to_dot(p: Node, n: Child, depth, hue):
    """Return the DOT statements for node ``n`` and its whole subtree.

    Emits the edge from parent ``p`` to ``n``, then the styling statement
    for ``n`` itself, then recurses into ``n.children`` one level deeper.
    The colour keeps the given ``hue`` for the entire subtree, while its
    second component grows by 0.1 per level of ``depth``.
    """
    # NOTE(review): Graphviz documents HSV components as 0..1; the third
    # components 85 and 50 look like percentages -- confirm intended values.
    saturation = depth * 0.1
    pieces = [
        f'{p.dot_id()} -> {n.dot_id()} [color="{hue},{saturation},85"]\n',
        f'{n.dot_id()} [color="{hue},{saturation},50", shape="circle", width="0.1"]\n',
    ]
    pieces.extend(
        _geonames_node_to_dot(n, sub, depth + 1, hue) for sub in n.children
    )
    return "".join(pieces)
if __name__ == "__main__":
    # Script entry point: load the GeoNames trie, render it as DOT and
    # store the result in geo_dot.dot in the current directory.
    dot_source = geonames_to_dot(load_geonames())
    with open("geo_dot.dot", "w") as out:
        out.write(dot_source)

BIN
trie.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 MiB