Geonames 500 example trie, build status, trie graph
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
parent
c6fad6471f
commit
d8d8436325
5 changed files with 93 additions and 0 deletions
|
@ -1,6 +1,10 @@
|
||||||
|
[![Build Status](https://drone.friedl.net/api/badges/incubator/bytetrie/status.svg)](https://drone.friedl.net/incubator/bytetrie)
|
||||||
|
|
||||||
# Bytetrie
|
# Bytetrie
|
||||||
A fast, dependency-free, self-compressing trie with radix 256 in pure Python.
|
A fast, dependency-free, self-compressing trie with radix 256 in pure Python.
|
||||||
|
|
||||||
|
![](trie.png)
|
||||||
|
|
||||||
Bytetrie allows fast prefix search in a large corpus of keys. Each key can be
|
Bytetrie allows fast prefix search in a large corpus of keys. Each key can be
|
||||||
associated with arbitrary data. It features fast lookup times at the cost of
|
associated with arbitrary data. It features fast lookup times at the cost of
|
||||||
expensive insertion. A Bytetrie is best used if it can be pre-filled with data.
|
expensive insertion. A Bytetrie is best used if it can be pre-filled with data.
|
||||||
|
|
BIN
examples/cities500.tar.gz
Normal file
BIN
examples/cities500.tar.gz
Normal file
Binary file not shown.
7
examples/gen.sh
Executable file
7
examples/gen.sh
Executable file
|
@ -0,0 +1,7 @@
|
||||||
|
#!/bin/sh
# Regenerate the GeoNames example graph: unpack cities500.tar.gz, move the
# data file next to geonames.py, run geonames.py (which writes geo_dot.dot)
# and render that file to a PNG with Graphviz' twopi layout.

# Stop at the first failing step so that later steps never run against a
# missing or stale cities500.txt / geo_dot.dot.
set -e

xwim cities500.tar.gz
mv cities500/cities500.txt .
rmdir cities500
python geonames.py
twopi -Tpng geo_dot.dot -o geo_dot_twopi.png -Groot=root -x
|
82
examples/geonames.py
Normal file
82
examples/geonames.py
Normal file
|
@ -0,0 +1,82 @@
|
||||||
|
import csv
|
||||||
|
from bytetrie import ByteTrie
|
||||||
|
|
||||||
|
def load_geonames():
    """Build a multi-value ByteTrie from the ``cities500.txt`` dump.

    Reads the tab-separated file ``cities500.txt`` from the current working
    directory. For every row, column 1 (UTF-8 encoded) becomes the key and
    column 17 the associated value.
    NOTE(review): per the GeoNames readme these columns should be the place
    name and the time zone -- confirm against the dump in use.

    Returns:
        The populated ``ByteTrie``.

    Raises:
        Exception: any error raised while inserting a row is re-raised after
            the offending row number and label have been printed.
    """
    t = ByteTrie(multi_value=True)
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding; newline="" is what the csv module expects for its input file.
    with open("cities500.txt", "r", encoding="utf-8", newline="") as f:
        # The dump is plain tab-delimited text without quoting, so a literal
        # '"' inside a field must be kept as data, not parsed as a quote.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for i, row in enumerate(reader):
            try:
                t.insert(row[1].encode("utf-8"), row[17])
            except Exception:
                # Report which row failed, then let the error propagate.
                print(f"Error in row {i}")
                print(f"Label: '{row[1]}'")
                print(f"Type: {type(row[1])}")
                raise
    return t
|
||||||
|
|
||||||
|
def insert(trie):
    """Return a helper that inserts strings into ``trie``.

    The returned callable accepts any number of strings; each one is stored
    via ``trie.insert`` with its UTF-8 encoding as the key and the string
    itself as the value. Shall only be used to insert strings.
    """
    def _insert(*vals):
        for text in vals:
            trie.insert(text.encode('utf-8'), text)

    return _insert
|
||||||
|
|
||||||
|
def load_simple_trie():
    """Return a default-constructed ByteTrie filled with a fixed set of sample keys.

    Every key is inserted through the ``insert`` helper, so each string is
    stored under its UTF-8 bytes with itself as the value.
    """
    trie = ByteTrie()
    add = insert(trie)
    # Same keys, same grouping and same order as separate calls would give.
    for group in (
        ("A",),
        ("AA", "AB"),
        ("ABCDE",),
        ("AACDEF", "AACDEGG", "AACDEH"),
    ):
        add(*group)
    return trie
|
||||||
|
|
||||||
|
# This uses internal representations which are not supposed to
|
||||||
|
# be used as public API and are subject to change!
|
||||||
|
from bytetrie.bytetrie import ByteTrie, Node, Root, Child, Terminal
|
||||||
|
def geonames_to_dot(t: ByteTrie) -> str:
    """Render the trie as Graphviz DOT source.

    The output starts with a fixed ``strict digraph`` preamble (graph, edge
    and node defaults plus a ``root`` node statement), followed by the
    statements produced by ``_geonames_node_to_dot`` for every child of
    ``t.root``, and ends with the closing brace.

    Each direct child of the root, together with its whole subtree, gets its
    own hue; the hues are spread evenly by stepping ``1 / number of root
    children`` from one child to the next.

    Args:
        t: The trie to render. Its internal ``root`` / ``children``
            representation is accessed directly.

    Returns:
        The complete DOT document as a string. For a trie whose root has no
        children this is just the preamble and the closing brace.
    """
    header = (
        'strict digraph {\n'
        'graph [\n'
        'bgcolor="transparent"\n'
        '];\n'
        '\n'
        'edge [\n'
        'arrowhead="none",\n'
        'penwidth="0.05",\n'
        '];\n'
        '\n'
        'node [\n'
        'label="",\n'
        'sep="2"\n'
        '];\n'
        '\n'
        'root [shape="circle", width="0.4"]\n'
    )

    top_level = t.root.children
    branch_count = len(top_level)
    # An empty trie has no branches to colour; avoid dividing by zero.
    hue_inc = 1/branch_count if branch_count else 0
    hue = 0

    # Collect the pieces and join once instead of growing a string with +=.
    parts = [header]
    for child in top_level:
        parts.append(_geonames_node_to_dot(t.root, child, 1, hue))
        # Accumulate (rather than multiply) so the emitted float values stay
        # exactly what repeated addition produces.
        hue += hue_inc

    parts.append("}")
    return "".join(parts)
|
||||||
|
|
||||||
|
def _geonames_node_to_dot(p: Node, n: Child, depth, hue):
    """Return the DOT statements for node ``n`` and everything below it.

    Emits one edge statement from ``p`` to ``n`` and one node statement for
    ``n``, then the statements of each child of ``n`` in iteration order.
    The colour triple of both statements uses ``hue`` and ``depth*0.1`` as
    its first two components; children are rendered with ``depth + 1`` and
    the same ``hue``.
    """
    saturation = depth*0.1
    chunks = [
        f'{p.dot_id()} -> {n.dot_id()} [color="{hue},{saturation},85"]\n',
        f'{n.dot_id()} [color="{hue},{saturation},50", shape="circle", width="0.1"]\n',
    ]
    chunks.extend(
        _geonames_node_to_dot(n, kid, depth+1, hue) for kid in n.children
    )
    return "".join(chunks)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Load the GeoNames trie, render it as DOT and write it to geo_dot.dot.
    dot_source = geonames_to_dot(load_geonames())
    with open("geo_dot.dot", "w") as out:
        out.write(dot_source)
|
BIN
trie.png
Normal file
BIN
trie.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.5 MiB |
Loading…
Reference in a new issue