From 92077efb433a876afb71d78f2fe056733967436e Mon Sep 17 00:00:00 2001
From: Armin Friedl <dev@friedl.net>
Date: Sat, 10 Oct 2020 02:40:52 +0200
Subject: [PATCH] Add drone build, polish README

---
 .drone.yml           | 22 ++++++++++++++
 README.md            | 68 +++++++++++++++++++++++++++++++++-----------
 bytetrie/bytetrie.py | 24 +++++++++++-----
 setup.py             |  5 ++--
 4 files changed, 93 insertions(+), 26 deletions(-)
 create mode 100644 .drone.yml

diff --git a/.drone.yml b/.drone.yml
new file mode 100644
index 0000000..1627e8b
--- /dev/null
+++ b/.drone.yml
@@ -0,0 +1,22 @@
+kind: pipeline
+type: docker
+name: default
+
+steps:
+- name: validate
+  image: python:3
+  commands:
+    - pip install mypy
+    - mypy bytetrie/bytetrie.py
+
+- name: publish
+  image: python:3
+  environment:
+    TWINE_USERNAME: __token__
+    TWINE_PASSWORD:
+      from_secret: pypi_test_token
+  commands:
+    - pip install twine setuptools wheel
+    - python setup.py sdist bdist_wheel
+    - twine check dist/*
+    - twine upload --repository testpypi dist/*
diff --git a/README.md b/README.md
index 77e1982..af1f294 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,8 @@ updates.
 ## Keys
 Keys are byte strings. Therefore, each node in the trie can have up to 256
 children (the radix). Keys do work well with utf-8 and other encodings as long
-as the encoding is consistent and deterministic. That is, a grapheme clusters
-are always encoded to the same byte sequence. Even if the standard allows for
+as the encoding is consistent and deterministic. That is, grapheme clusters
+are always encoded to the same byte sequence -- even if the standard allows for
 ambiguity. Usually that's a non-issue as long as the same encoder is used for
 insertion and lookup.
 
@@ -19,24 +19,24 @@ Since prefix search in unicode strings is one of the most common use-cases of
 bytetrie, a unicode layer on top of bytetrie is [planned](TODO.md).
 
 ## Data
-Bytetrie can associate arbitrary data (python objects) with keys. Data (or
-rather a reference thereof) is kept in-tree. No further processing is done.
+Bytetrie can associate arbitrary python objects with keys. Data (or rather a
+reference thereof) is kept in-tree. No further processing is done.
 
-In addition bytrie allows multi-valued tries. Every key is then associated with
+In addition, bytetrie allows multi-valued tries. Every key is then associated with
 a sequence of arbitrary objects.
 
 ## Performance
 Despite being in pure python bytetrie is _fast_. Sifting through the full
 [geonames](http://download.geonames.org/export/dump/) "allCountries" dataset for
-places starting with `Vienna` takes a mere 512µs. That's not even one
-millisecond for searching through 12,041,359 places. For comparison a warmed-up
+places starting with `Vienna` takes a mere 512µs. That's not even a
+millisecond for searching through 12,041,359 places. For comparison, a warmed-up
 ripgrep search through the same dataset takes three orders of magnitude (400ms)
 longer on the same machine.
 
-On the downside building the trie takes about 20 minutes and considerable
-memory. Also the performance is mostly trumped by the time it takes to collect
-terminal nodes. That is, the higher up the trie the search ends (and hence the
-more results the prefix search yields) the longer it takes. There are several
+On the downside, building the trie takes about 20 minutes and considerable
+memory. Also, the performance is mostly dominated by the time it takes to collect
+terminal nodes. The higher up the trie the search ends (and hence the more
+results the prefix search yields), the longer it takes. There are several
 low-hanging fruits left and further performance improvements are in the
 [pipeline](TODO.md).
 
@@ -44,7 +44,46 @@ low-hanging fruits left and further performance improvements are in the
 None. That's the point.
 
 # Getting started
-TODO
+Install bytetrie via [pip](https://pip.pypa.io/en/stable/quickstart/).
+```
+pip install -U bytetrie
+```
+
+The public interface is `ByteTrie` with the two methods `insert` and `find`.
+`find` returns a list of `Terminal` nodes from which the `key` and the `value`
+of each match can be retrieved via `key()` and `value()`.
+
+```python
+from bytetrie import ByteTrie
+
+t = ByteTrie(multi_value=True)
+t.insert(b"Hallo", "Dutch")
+t.insert(b"Hello", "English")
+t.insert(b"Hug", "Gaelic")
+t.insert(b"Hallo", "German")
+t.insert("Hē".encode("utf-8"), "Hindi")
+t.insert("Halló".encode("utf-8"), "Icelandic")
+t.insert(b"Hej", "Polish")
+t.insert(b"Hei", "Romanian")
+t.insert(b"Hujambo", "Swahili")
+t.insert(b"Hej", "Swedish")
+t.insert(b"Helo", "Welsh")
+
+print("Where to say 'Hi' with 'He'?") 
+print(f"{[(n.key(), n.value()) for n in t.find(b'He')]}")
+
+print("Where to say 'Hi' with 'Ha'?") 
+print(f"{[(n.key().decode('utf-8'), n.value()) for n in t.find(b'Ha')]}")
+
+print("Where to say 'Hi' with 'Hē'?") 
+print(f"Say 'Hi' with utf-8: {[(n.key().decode('utf-8'), n.value()) for n in t.find('Hē'.encode('utf-8'))]}")
+```
+
+# Contribute
+If you want to contribute to `bytetrie`, feel free to send patches to
+dev[at]friedl[dot]net. Alternatively, you can issue a pull request on GitHub
+which will be cherry-picked into my tree. If you plan significant long-term
+contributions, drop me a mail for access to the incubator repository.
 
 # Github Users
 If you are visiting this repository on GitHub, you are on a mirror of
@@ -53,8 +92,3 @@ with my other GitHub mirrors.
 
 Like with my other incubator projects, once I consider `bytetrie` reasonable
 stable the main tree will move to GitHub.
-
-If you want to contribute to `bytetrie` feel free to send patches to
-dev[at]friedl[dot]net. Alternatviely, you can issue a pull request on GitHub
-which will be cherry picked into my tree. If you plan significant long-term
-contributions drop me a mail for access to the incubator repository.
diff --git a/bytetrie/bytetrie.py b/bytetrie/bytetrie.py
index 221af0a..7a6f280 100644
--- a/bytetrie/bytetrie.py
+++ b/bytetrie/bytetrie.py
@@ -6,7 +6,7 @@ import logging
 log = logging.getLogger(__name__)
 
 class ByteTrie:
-    def __init__(self, multi_value=False):
+    def __init__(self, multi_value:bool=False):
         self.root = Root([])
         self.multi_value = multi_value
 
@@ -63,9 +63,9 @@ class ByteTrie:
         ancestor.put_child(node)
         return terminal
 
-    def find(self, prefix):
+    def find(self, prefix: ByteString) -> Sequence[Terminal]:
         node = self._find(self.root, prefix)
-        return self._get_terminals(node, prefix)
+        return self._get_terminals(node)
 
     def _find(self, node, prefix, collector=""):
         cutoff = node.cut_from(prefix)
@@ -84,15 +84,14 @@ class ByteTrie:
             log.debug(f"Found node {child} in {node} for {cutoff}. Traversing down.")
             return self._find(child, cutoff)
 
-    def _get_terminals(self, node, label_builder):
+    def _get_terminals(self, node):
         if not node: return []
 
         collector = []
         if isinstance(node, Terminal):
-            collector.append((node, label_builder))
+            collector.append((node))
         for child in node.children:
-            l = child.extend(label_builder)
-            collector.extend(self._get_terminals(child, l))
+            collector.extend(self._get_terminals(child))
         return collector
 
     def to_dot(self) -> str:
@@ -275,6 +274,17 @@ class Terminal(Child):
             return t
         return cls(child.label, content, child.parent, child.children, multi_value)
 
+    def key(self) -> ByteString:
+        l = bytes(self.label)
+        parent = self.parent
+        while isinstance(parent, Child):
+            l = bytes(parent.label) + l
+            parent = parent.parent
+        return l
+
+    def value(self) -> Any:
+        return self.content
+
     def to_dot(self) -> str:
         s = super().to_dot()
         s += f"{self.dot_id()} [color=blue]\n"
diff --git a/setup.py b/setup.py
index 0de603f..5f269f7 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,12 @@
 import setuptools
 
+
 with open("README.md", "r") as fh:
     long_description = fh.read()
 
-setup(
+setuptools.setup(
     name="bytetrie",
-    version="0.0.1",
+    version="0.0.2",
     url="https://git.friedl.net/incubator/bytetrie",
     license="MIT",
     author="Armin Friedl",