From 92077efb433a876afb71d78f2fe056733967436e Mon Sep 17 00:00:00 2001 From: Armin Friedl Date: Sat, 10 Oct 2020 02:40:52 +0200 Subject: [PATCH] Add drone build, polish README --- .drone.yml | 22 ++++++++++++++ README.md | 68 +++++++++++++++++++++++++++++++++----------- bytetrie/bytetrie.py | 24 +++++++++++----- setup.py | 5 ++-- 4 files changed, 93 insertions(+), 26 deletions(-) create mode 100644 .drone.yml diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 0000000..1627e8b --- /dev/null +++ b/.drone.yml @@ -0,0 +1,22 @@ +kind: pipeline +type: docker +name: default + +steps: +- name: validate + image: python:3 + commands: + - pip install mypy + - mypy bytetrie/bytetrie.py + +- name: publish + image: python:3 + environment: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: + from_secret: pypi_test_token + commands: + - pip install twine setuptools wheel + - python setup.py sdist bdist_wheel + - twine check dist/* + - twine upload --repository testpypi dist/* diff --git a/README.md b/README.md index 77e1982..af1f294 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,8 @@ updates. ## Keys Keys are byte strings. Therefore, each node in the trie can have up to 256 children (the radix). Keys do work well with utf-8 and other encodings as long -as the encoding is consistent and deterministic. That is, a grapheme clusters -are always encoded to the same byte sequence. Even if the standard allows for +as the encoding is consistent and deterministic. That is, grapheme clusters +are always encoded to the same byte sequence -- even if the standard allows for ambiguity. Usually that's a non-issue as long as the same encoder is used for insertion and lookup. @@ -19,24 +19,24 @@ Since prefix search in unicode strings is one of the most common use-cases of bytetrie, a unicode layer on top of bytetrie is [planned](TODO.md). ## Data -Bytetrie can associate arbitrary data (python objects) with keys. Data (or -rather a reference thereof) is kept in-tree. 
No further processing is done. +Bytetrie can associate arbitrary python objects with keys. Data (or rather a +reference thereof) is kept in-tree. No further processing is done. -In addition bytrie allows multi-valued tries. Every key is then associated with +In addition, bytetrie allows multi-valued tries. Every key is then associated with a sequence of arbitrary objects. ## Performance Despite being in pure python bytetrie is _fast_. Sifting through the full [geonames](http://download.geonames.org/export/dump/) "allCountries" dataset for -places starting with `Vienna` takes a mere 512µs. That's not even one -millisecond for searching through 12,041,359 places. For comparison a warmed-up +places starting with `Vienna` takes a mere 512µs. That's not even a +millisecond for searching through 12,041,359 places. For comparison, a warmed-up ripgrep search through the same dataset takes three orders of magnitude (400ms) longer on the same machine. -On the downside building the trie takes about 20 minutes and considerable -memory. Also the performance is mostly trumped by the time it takes to collect -terminal nodes. That is, the higher up the trie the search ends (and hence the -more results the prefix search yields) the longer it takes. There are several +On the downside, building the trie takes about 20 minutes and considerable +memory. Also, the performance is mostly trumped by the time it takes to collect +terminal nodes. The higher up the trie the search ends (and hence the more +results the prefix search yields) the longer it takes. There are several low-hanging fruits left and further performance improvements are in the [pipeline](TODO.md). @@ -44,7 +44,46 @@ low-hanging fruits left and further performance improvements are in the None. That's the point. # Getting started -TODO +Install bytetrie via [pip](https://pip.pypa.io/en/stable/quickstart/). +``` +pip install -U bytetrie +``` + +The public interface is `ByteTrie` with the two methods `insert` and `find`. 
+Find returns a list of `Terminals` from which the `key` and the `value` of the +node can be retrieved. + +```python +from bytetrie import ByteTrie + +t = ByteTrie(multi_value=True) +t.insert(b"Hallo", "Dutch") +t.insert(b"Hello", "English") +t.insert(b"Hug", "Gaelic") +t.insert(b"Hallo", "German") +t.insert("Hē".encode("utf-8"), "Hindi") +t.insert("Halló".encode("utf-8"), "Icelandic") +t.insert(b"Hej", "Polish") +t.insert(b"Hei", "Romanian") +t.insert(b"Hujambo", "Swahili") +t.insert(b"Hej", "Swedish") +t.insert(b"Helo", "Welsh") + +print("Where to say 'Hi' with 'He'?") +print(f"{[(n.key(), n.value()) for n in t.find(b'He')]}") + +print("Where to say 'Hi' with 'Ha'?") +print(f"{[(n.key().decode('utf-8'), n.value()) for n in t.find(b'Ha')]}") + +print("Where to say 'Hi' with 'Hē'?") +print(f"Say 'Hi' with utf-8: {[(n.key().decode('utf-8'), n.value()) for n in t.find('Hē'.encode('utf-8'))]}") +``` + +# Contribute +If you want to contribute to `bytetrie` feel free to send patches to +dev[at]friedl[dot]net. Alternatively, you can issue a pull request on GitHub +which will be cherry picked into my tree. If you plan significant long-term +contributions drop me a mail for access to the incubator repository. # Github Users If you are visiting this repository on GitHub, you are on a mirror of @@ -53,8 +92,3 @@ with my other GitHub mirrors. Like with my other incubator projects, once I consider `bytetrie` reasonable stable the main tree will move to GitHub. - -If you want to contribute to `bytetrie` feel free to send patches to -dev[at]friedl[dot]net. Alternatviely, you can issue a pull request on GitHub -which will be cherry picked into my tree. If you plan significant long-term -contributions drop me a mail for access to the incubator repository. 
diff --git a/bytetrie/bytetrie.py b/bytetrie/bytetrie.py index 221af0a..7a6f280 100644 --- a/bytetrie/bytetrie.py +++ b/bytetrie/bytetrie.py @@ -6,7 +6,7 @@ import logging log = logging.getLogger(__name__) class ByteTrie: - def __init__(self, multi_value=False): + def __init__(self, multi_value:bool=False): self.root = Root([]) self.multi_value = multi_value @@ -63,9 +63,9 @@ class ByteTrie: ancestor.put_child(node) return terminal - def find(self, prefix): + def find(self, prefix: ByteString) -> Sequence[Terminal]: node = self._find(self.root, prefix) - return self._get_terminals(node, prefix) + return self._get_terminals(node) def _find(self, node, prefix, collector=""): cutoff = node.cut_from(prefix) @@ -84,15 +84,14 @@ class ByteTrie: log.debug(f"Found node {child} in {node} for {cutoff}. Traversing down.") return self._find(child, cutoff) - def _get_terminals(self, node, label_builder): + def _get_terminals(self, node): if not node: return [] collector = [] if isinstance(node, Terminal): - collector.append((node, label_builder)) + collector.append((node)) for child in node.children: - l = child.extend(label_builder) - collector.extend(self._get_terminals(child, l)) + collector.extend(self._get_terminals(child)) return collector def to_dot(self) -> str: @@ -275,6 +274,17 @@ class Terminal(Child): return t return cls(child.label, content, child.parent, child.children, multi_value) + def key(self) -> ByteString: + l = bytes(self.label) + parent = self.parent + while isinstance(parent, Child): + l = bytes(parent.label) + l + parent = parent.parent + return l + + def value(self) -> Any: + return self.content + def to_dot(self) -> str: s = super().to_dot() s += f"{self.dot_id()} [color=blue]\n" diff --git a/setup.py b/setup.py index 0de603f..5f269f7 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,12 @@ import setuptools + with open("README.md", "r") as fh: long_description = fh.read() -setup( +setuptools.setup( name="bytetrie", - version="0.0.1", + version="0.0.2", 
url="https://git.friedl.net/incubator/bytetrie", license="MIT", author="Armin Friedl",