import csv

from bytetrie import ByteTrie


def load_geonames():
    """Build a multi-value ByteTrie from ``cities500.txt``.

    The file is read from the current working directory as tab-separated
    rows. For every row, column 1 (encoded as UTF-8) becomes the key and
    column 17 becomes the value stored under that key.

    Returns:
        The populated ``ByteTrie``.

    Raises:
        Exception: Any error raised while indexing a row or inserting it is
            re-raised unchanged after diagnostics have been printed.
    """
    t = ByteTrie(multi_value=True)
    # The GeoNames dump is documented as UTF-8, so do not depend on the
    # locale's default encoding. newline="" is what the csv module asks for
    # so that it can handle line endings itself.
    with open("cities500.txt", "r", encoding="utf-8", newline="") as f:
        # The dump is plain tab-delimited text without any quoting. With the
        # csv default, a field starting with '"' would be parsed as a quoted
        # field and could swallow the following tabs and lines.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for i, row in enumerate(reader):
            try:
                t.insert(row[1].encode("utf-8"), row[17])
            except Exception:
                print(f"Error in row {i}")
                # A short row has no label column; indexing it here would
                # raise a second error and bury the original one.
                if len(row) > 1:
                    print(f"Label: '{row[1]}'")
                    print(f"Type: {type(row[1])}")
                raise
    return t


def insert(trie):
    """Return a helper that inserts strings into *trie*.

    The returned callable accepts any number of strings. Each one is stored
    with its UTF-8 encoding as the key and the string itself as the value.
    It shall only be used to insert strings.
    """
    def _insert(*vals):
        for val in vals:
            trie.insert(val.encode('utf-8'), val)
    return _insert


def load_simple_trie():
    """Return a small ByteTrie pre-filled with a handful of fixed keys.

    The keys share several prefixes ("A", "AA", "AACDE", ...).
    """
    t = ByteTrie()
    ins = insert(t)
    ins("A")
    ins("AA", "AB")
    ins("ABCDE")
    ins("AACDEF", "AACDEGG", "AACDEH")
    return t


# This uses internal representations which
# are not supposed to
# be used as public API and are subject to change!
from bytetrie.bytetrie import ByteTrie, Node, Root, Child, Terminal


def geonames_to_dot(t: ByteTrie):
    """Render the trie *t* as a Graphviz DOT document.

    Every child of the root gets its own hue, spread evenly over the range
    [0, 1); the whole subtree below that child is drawn in that hue.

    Args:
        t: The trie to render. Its ``root.children`` is iterated and measured
            with ``len``.

    Returns:
        The complete DOT source as a single string. For a trie whose root has
        no children this is the graph header with only the ``root`` node.
    """
    children = t.root.children
    count = len(children)

    # Collect fragments and join once instead of growing a string with "+=".
    parts = ["""strict digraph {
    graph [
        bgcolor="transparent"
    ];

    edge [
        arrowhead="none",
        penwidth="0.05",
    ];

    node [
        label="",
        sep="2"
    ];

    root [shape="circle", width="0.4"]
    """]

    # An empty trie has no subtrees to colour; avoid dividing by zero.
    hue_inc = 1 / count if count else 0
    hue = 0

    for child in children:
        parts.append(_geonames_node_to_dot(t.root, child, 1, hue))
        hue += hue_inc

    parts.append("}")
    return "".join(parts)


def _geonames_node_to_dot(p: Node, n: Child, depth, hue):
    """Return the DOT statements for the edge ``p -> n`` and n's whole subtree.

    Statements are emitted in pre-order: the edge and node statement for *n*
    first, then the same for each of its children in iteration order.

    Args:
        p: Parent node; only its ``dot_id()`` is used.
        n: Node to render; its ``dot_id()`` and ``children`` are used.
        depth: Level of *n* below the root; scales the second colour component.
        hue: First colour component, shared by the entire subtree.

    Returns:
        The concatenated DOT statements, each terminated by a newline.
    """
    # NOTE(review): Graphviz documents H, S and V as values in 0.0-1.0. The
    # constants 85 / 50 and depth * 0.1 for depth > 10 are outside that range;
    # confirm the intended colours.
    lines = []
    # An explicit stack keeps the traversal independent of Python's recursion
    # limit and avoids re-copying subtree strings at every level.
    stack = [(p, n, depth)]
    while stack:
        parent, node, level = stack.pop()
        lines.append(
            f'{parent.dot_id()} -> {node.dot_id()} [color="{hue},{level*0.1},85"]\n'
        )
        lines.append(
            f'{node.dot_id()} [color="{hue},{level*0.1},50", shape="circle", width="0.1"]\n'
        )
        # Push children in reverse so they are popped, and therefore emitted,
        # in their original order.
        stack.extend(
            (node, child, level + 1) for child in reversed(list(node.children))
        )

    return "".join(lines)


if __name__ == "__main__":
    t = load_geonames()
    s = geonames_to_dot(t)
    with open("geo_dot.dot", "w") as f:
        f.write(s)