From 4c88eb41217118b44256fa07ab8c4a70c5034162 Mon Sep 17 00:00:00 2001
From: Armin Friedl <dev@friedl.net>
Date: Mon, 26 Oct 2020 11:47:07 +0100
Subject: [PATCH] Add timezone api

---
 timezone/__init__.py                |  11 +
 timezone/api.py                     |  15 ++
 timezone/data/search.py             |   1 +
 timezone/data/util/patricia_trie.py | 147 ++++++++++++++
 timezone/data/util/preprocessor.py  |  32 +++
 timezone/data/util/test.py          |   7 +
 timezone/search/__init__.py         |   0
 timezone/search/trie.py             | 300 ++++++++++++++++++++++++++++
 timezone/timezone.py                |  29 +++
 9 files changed, 542 insertions(+)
 create mode 100644 timezone/__init__.py
 create mode 100644 timezone/api.py
 create mode 100644 timezone/data/search.py
 create mode 100644 timezone/data/util/patricia_trie.py
 create mode 100644 timezone/data/util/preprocessor.py
 create mode 100644 timezone/data/util/test.py
 create mode 100644 timezone/search/__init__.py
 create mode 100644 timezone/search/trie.py
 create mode 100644 timezone/timezone.py

diff --git a/timezone/__init__.py b/timezone/__init__.py
new file mode 100644
index 0000000..ac918b4
--- /dev/null
+++ b/timezone/__init__.py
@@ -0,0 +1,11 @@
+""" Timezone management
+
+Provides:
+- `search.py`: Fast prefix search for timezones
+- `timezone.py`: Conversion functions
+- `api.py`: A REST API for the functionality provided by this package
+"""
+from flask import Blueprint
+
+app = Blueprint('timezone', __name__, template_folder='templates')
+from . import api  # imported last: api.py imports `app` from this module to register its routes
diff --git a/timezone/api.py b/timezone/api.py
new file mode 100644
index 0000000..593f192
--- /dev/null
+++ b/timezone/api.py
@@ -0,0 +1,15 @@
+from flask import request, jsonify
+from time import time
+
+import uuid
+import struct
+
+from . import app
+
+@app.route('/api/v1/autocomplete', methods=['GET'])
+def autocomplete_timezone():
+    """Echo the `complete` query parameter as plain text ("No part" if it is missing or empty)."""
+    complete_str = request.args.get('complete')
+    # text/plain: the echoed value is user input and must never be rendered as HTML
+    return (complete_str or "No part"), 200, {'Content-Type': 'text/plain; charset=utf-8'}
+
diff --git a/timezone/data/search.py b/timezone/data/search.py
new file mode 100644
index 0000000..a252538
--- /dev/null
+++ b/timezone/data/search.py
@@ -0,0 +1 @@
+def suggest(prefix): raise NotImplementedError("prefix search is not implemented yet")  # placeholder body so the module can be imported
diff --git a/timezone/data/util/patricia_trie.py b/timezone/data/util/patricia_trie.py
new file mode 100644
index 0000000..32e781e
--- /dev/null
+++ b/timezone/data/util/patricia_trie.py
@@ -0,0 +1,147 @@
+import logging
+
+class PatriciaTrie:
+    """Compressed prefix tree (Patricia trie) over strings.
+
+    Every node holds a fragment (`elem`) of the inserted strings. The fragments
+    on the path from the root to a node flagged `leaf` spell one inserted string.
+    """
+
+    def __init__(self):
+        # Pass an explicit list so the root never depends on Node's default for `children`.
+        self.root = Node(children=[])
+
+    def find(self, prefix, node=None, collector=""):
+        """Return a list of all inserted strings that start with `prefix`.
+
+        `node` and `collector` carry the recursion state (current node and the
+        text spelled on the way to it); external callers pass `prefix` only.
+        """
+        if not node: return self.find(prefix, self.root)
+
+        logging.debug(f"Looking for prefix {prefix} at {node.elem}")
+        if not prefix:
+            # Prefix fully consumed: every leaf at or below `node` is a hit.
+            res = []
+            if node.leaf:
+                logging.debug(f"Found leaf {node.elem}")
+                res.append(collector)
+            for child in node.children:
+                logging.debug(f"Looking for leafs in {node.elem}")
+                res.extend(self.find(prefix, child, collector+child.elem))
+            logging.debug(f"Result for {node.elem}: {res}")
+            return res
+
+        for child in node.children:
+            if prefix.startswith(child.elem):
+                return self.find(prefix[len(child.elem):], child, collector+child.elem)
+            # Prefix ends inside this child's fragment: its whole subtree matches.
+            if child.elem.startswith(prefix):
+                return self.find("", child, collector+child.elem)
+
+        return []
+
+    def add(self, elem):
+        """Insert the string `elem`, splitting an existing node where necessary."""
+        (node, split_idx, elem_rest) = self.find_longest_match(elem, self.root)
+
+        # elem already found in trie
+        # just make sure node is marked as leaf
+        if not split_idx and not elem_rest:
+            node.leaf = True
+            return
+
+        # - elem not in trie
+        # - parent node exhausted
+        # This can happen if parent is root, or elem is larger than
+        # largest matching elem in trie so far
+        if not split_idx:
+            node.children.append(Node(elem=elem_rest, parent=node, leaf=True, children=[]))
+            return
+
+        # In the remaining cases elem leaves the trie in the middle of `node`.
+        # Split the node at split_idx: `node` keeps the head of its fragment, the
+        # tail moves into a new child that takes over the old children and the
+        # old leaf status.
+        old_elem = node.elem
+        tail = Node(elem=old_elem[split_idx:], parent=node, leaf=node.leaf, children=node.children)
+        for grandchild in tail.children:
+            grandchild.parent = tail
+        node.elem = old_elem[:split_idx]
+        node.children = [tail]
+
+        # - elem ends exactly at the split: the head itself becomes the leaf
+        # - elem continues differently: the head is an inner node and the
+        #   unmatched rest of elem becomes a second child
+        node.leaf = not elem_rest
+        if elem_rest:
+            node.children.append(Node(elem=elem_rest, parent=node, leaf=True, children=[]))
+
+    def find_longest_match(self, elem, node):
+        """Find the place below `node` where `elem` has to be inserted.
+
+        Returns a tuple `(node, split_idx, elem_rest)`:
+        - `(n, None, None)`: the path ending in `n` already spells elem
+        - `(n, None, rest)`: `rest` has to be added as a new child of `n`
+        - `(n, idx, None)`: elem ends inside `n.elem` at index `idx`
+        - `(n, idx, rest)`: elem deviates from `n.elem` at index `idx` and
+          `rest` is the part of elem from there on
+        """
+        for child in node.children:
+            if not child.elem or not elem: continue
+
+            # child does not match
+            # (compare by value: `is` is only reliable for interned strings)
+            if child.elem[0] != elem[0]: continue
+
+            # child matches completely
+            if elem.startswith(child.elem):
+                # special case: the node already exists
+                if len(elem) == len(child.elem):
+                    return (child, None, None)
+                # recurse down the trie
+                return self.find_longest_match(elem[len(child.elem):], child)
+
+            # elem matches completely, implies that elem is shorter
+            # than child.elem. Split child at len(elem)
+            if child.elem.startswith(elem):
+                return (child, len(elem), None)
+
+            # child does not match completely but at least first char matches
+            # find longest split index
+            for i in range(len(elem)):
+                if elem[i] == child.elem[i]: continue
+                else: return (child, i, elem[i:])
+
+        # No child(-prefix) matched, create another child
+        return (node, None, elem)
+
+    def to_dot(self):
+        """Print the trie to stdout as a Graphviz `graph`."""
+        print("graph {")
+        self._to_dot(self.root)
+        print("}")
+
+    def _to_dot(self, node):
+        """Print the edges below `node`; leaf nodes are additionally colored blue."""
+        for child in node.children:
+            if not node.elem: print(f'root -- "{child.elem}";')
+            else: print(f'"{node.elem}" -- "{child.elem}";')
+
+            if child.leaf:
+                print(f'"{child.elem}" [color=blue];')
+
+            self._to_dot(child)
+
+class Node:
+    """Trie node: a string fragment `elem`, tree links, a `leaf` flag and payload fields."""
+    def __init__(self, elem=None, parent=None, children=None,
+                 leaf=False, offset=0, title=None, info=None):
+        self.elem = elem
+        self.parent = parent
+        # A fresh list per node; a `[]` default would be shared by all nodes.
+        self.children = [] if children is None else children
+        self.leaf = leaf
+        self.offset = offset  # payload
+        self.title = title if title else elem  # payload; falls back to elem
+        self.info = info  # payload
diff --git a/timezone/data/util/preprocessor.py b/timezone/data/util/preprocessor.py
new file mode 100644
index 0000000..d23bae5
--- /dev/null
+++ b/timezone/data/util/preprocessor.py
@@ -0,0 +1,32 @@
+# Create Patricia Tries from various datasets
+#
+# Each Trie leaf has a timezone assigned which
+# may be a fixed UTC offset or a tz timezone
+
+import csv
+from patricia_trie import PatriciaTrie
+
+# Geonames from
+# http://download.geonames.org/export/dump/allCountries.zip
+def geonames_allcountries(path):
+    """Build a PatriciaTrie from column 2 (spaces removed) of all rows whose column 6 is "P"."""
+    patricia = PatriciaTrie()
+    # The dump is UTF-8, tab-separated and unquoted: read it independent of the
+    # platform default encoding and do not let `"` inside a field start a quoted field.
+    with open(path, encoding="utf-8", newline="") as all_countries_csv:
+        reader = csv.reader(all_countries_csv, delimiter='\t', quoting=csv.QUOTE_NONE)
+        for row in reader:
+            if row[6] != "P": continue
+            patricia.add(row[2].replace(' ', ''))
+
+    return patricia
+
+def timezone_abbreviations():
+    """Placeholder, returns None. Planned source of the timezone abbreviations:
+    https://www.timeanddate.com/time/zones/ """
+    return None
+
+def tz_zones():
+    """Placeholder, returns None. Planned source of the tz timezones:
+    https://www.iana.org/time-zones """
+    return None
diff --git a/timezone/data/util/test.py b/timezone/data/util/test.py
new file mode 100644
index 0000000..e6b0660
--- /dev/null
+++ b/timezone/data/util/test.py
@@ -0,0 +1,7 @@
+import contextlib
+import preprocessor
+import sys
+
+p = preprocessor.geonames_allcountries("/home/armin/Downloads/allCountries/allCountries10000.txt")
+with open("/home/armin/Desktop/test.dot", "w") as dot_file, contextlib.redirect_stdout(dot_file):
+    p.to_dot()  # to_dot() prints; redirect_stdout restores sys.stdout afterwards
diff --git a/timezone/search/__init__.py b/timezone/search/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/timezone/search/trie.py b/timezone/search/trie.py
new file mode 100644
index 0000000..15fb25a
--- /dev/null
+++ b/timezone/search/trie.py
@@ -0,0 +1,300 @@
+"""Radix Trie with radix 256
+
+A Radix Trie[1] - once built - allows efficient prefix search. The trie works
+on byte strings and hence is oblivious to encoding. The encoding for creation
+and search must match. Payload of each node can be an arbitrary object.
+
+Usage
+-----
+.. code :: python
+    t = Trie()
+    t.insert(b"Hello", "P1")
+    t.insert(b"Hi", "P2")
+    t.insert(b"Hela", "P3")
+    t.find(b"Hel") # [(Terminal "P3", b"Hela"), (Terminal "P1", b"Hello")]
+
+[1] https://en.wikipedia.org/wiki/Radix_tree
+"""
+
+from __future__ import annotations
+from typing import Sequence, MutableSequence, ByteString, Any, Optional
+from abc import ABC, abstractmethod
+
+import logging
+log = logging.getLogger(__name__)
+
+class Trie:
+    """Radix trie mapping byte-string labels to content; see the module docstring."""
+    def __init__(self, multi_value=False):
+        self.root = Root([])
+        self.multi_value = multi_value
+
+    def insert(self, label: ByteString, content: Any):
+        """Store `content` under `label` and return the Terminal node that holds it."""
+        log.info("Inserting %s into Trie", label)
+        start = self.root.child_by_common_prefix(label)
+        if not start:
+            log.debug("Creating new terminal for %s at root", label)
+            new_node = Terminal(label, content, self.root, [], self.multi_value)
+            self.root.put_child(new_node)
+            return new_node
+        log.debug("Found match %s for %s. Traversing down", start, label)
+        return self._insert(start, label, content)
+
+    def _insert(self, node, label, content):
+        """Insert `label` at or below `node`; both start with the same byte."""
+        log.info("Inserting %s into Trie at %s", label, node)
+        if node.has_label(label):
+            log.debug("%s equals %s. Wrapping node as Terminal.", node, label)
+            if isinstance(node, Terminal) and not self.multi_value:
+                log.warning("%s is already a Terminal. Content will be overwritten.", node)
+            terminal = Terminal.from_child(node, content, self.multi_value)
+            node.replace_with(terminal)
+            return terminal
+
+        if node.is_prefix_of(label):
+            log.debug("%s is prefix of %s", node, label)
+            cutoff = node.cut_from(label)
+            next_node = node.child_by_common_prefix(cutoff)
+            if next_node:
+                log.debug("Found match %s for %s. Traversing down.", next_node, cutoff)
+                return self._insert(next_node, cutoff, content)
+            log.debug("No matching child found for %s. Creating new child terminal.", cutoff)
+            terminal = Terminal(cutoff, content, node, [], self.multi_value)
+            node.put_child(terminal)
+            return terminal
+
+        if node.starts_with(label):
+            log.debug("%s is part of %s. Creating new parent from %s", label, node, label)
+            new_node = Terminal(label, content, node.parent, [], self.multi_value)
+            node.replace_with(new_node)
+            node.strip_prefix(label)
+            new_node.put_child(node)
+            return new_node
+
+        log.debug("%s and %s have a common ancestor", label, node)
+        common_prefix = node.common_prefix(label)
+        log.debug("Creating new ancestor for %s", common_prefix)
+        ancestor = Child(common_prefix, node.parent, [])
+        node.replace_with(ancestor)
+        terminal = Terminal(cut_off_prefix(common_prefix, label), content, ancestor, [], self.multi_value)
+        node.strip_prefix(common_prefix)
+        ancestor.put_child(terminal)
+        ancestor.put_child(node)
+        return terminal
+
+    def find(self, prefix):
+        """Return `(terminal, full_label)` tuples for every label starting with `prefix`."""
+        node, label = self._find(self.root, prefix)
+        return self._get_terminals(node, label)
+
+    def _find(self, node, prefix, collector=b""):
+        """Return `(match, label)`: the topmost node whose label starts with `prefix`
+        (None if there is none) and its full label. `collector` is the label of `node`."""
+        cutoff = node.cut_from(prefix)
+        log.debug("Searching for %s in %s", cutoff, node)
+        child = node.child_by_prefix_match(cutoff)
+        if child:
+            log.debug("Found node %s in %s for %s. Traversing down.", child, node, cutoff)
+            return self._find(child, cutoff, collector + bytes(child.label))
+        if not cutoff: return node, collector
+        log.debug("Leftover cutoff %s. Trying to find node with prefix %s", cutoff, cutoff)
+        child = node.child_by_common_prefix(cutoff)
+        if not child or not child.starts_with(cutoff): return None, collector
+        log.debug("Found child %s starting with %s", child, cutoff)
+        # prefix ends inside child's label: the label reaches beyond the prefix
+        return child, collector + bytes(child.label)
+
+    def _get_terminals(self, node, label_builder):
+        """Collect `(terminal, label)` for `node`, whose label is `label_builder`, and all nodes below."""
+        if not node: return []
+        collector = [(node, label_builder)] if isinstance(node, Terminal) else []
+        for child in node.children:
+            collector.extend(self._get_terminals(child, child.extend(label_builder)))
+        return collector
+
+    def to_dot(self) -> str:
+        return "graph {\n\n"+self.root.to_dot()+"\n}"
+
+def has_common_prefix(label: ByteString, other_label: ByteString) -> bool:
+    """ Tell whether both (non-empty) labels begin with the same element. """
+    assert label and other_label
+    return label[0] == other_label[0]
+
+def common_prefix(label: ByteString, other_label: ByteString) -> ByteString:
+    """ Return the longest shared prefix of label and other_label as a bytearray. """
+    shared = bytearray()
+    for mine, theirs in zip(label, other_label):
+        if mine != theirs: break
+        shared.append(mine)
+    return shared
+
+def is_prefix_of(prefix: ByteString, label: ByteString) -> bool:
+    """ Tell whether prefix is a (not necessarily proper) prefix of label.
+
+    An empty prefix is a prefix of every label.
+    """
+    fits = len(prefix) <= len(label)
+    return fits and all(a == b for a, b in zip(prefix, label))
+
+def find_first(predicate, iterable):
+    """ Return the first element of iterable for which predicate holds, else None """
+    matches = filter(predicate, iterable)
+    return next(matches, None)
+
+def cut_off_prefix(prefix: ByteString, label: ByteString) -> ByteString:
+    """ Return label without its leading prefix, as bytes. Asserts that label starts with prefix. """
+    assert is_prefix_of(prefix, label)
+    return bytes(label)[len(prefix):]
+
+class Node(ABC):
+    """ Base of all trie nodes: owns the list of children and the dot rendering. """
+    def __init__(self, children: MutableSequence[Child]):
+        self.children = children
+
+    def child_by_common_prefix(self, label: ByteString) -> Optional[Child]:
+        """ Return Child that has a common prefix with label if one exists. """
+        def by_common_prefix(child: Child):
+            return has_common_prefix(child.label, label)
+        return find_first(by_common_prefix, self.children)
+
+    def child_by_prefix_match(self, label: ByteString) -> Optional[Child]:
+        """ Return Child whose label is a prefix of the given label if one exists. """
+        def by_prefix_match(child: Child):
+            return is_prefix_of(child.label, label)
+        return find_first(by_prefix_match, self.children)
+
+    def put_child(self, child: Child):
+        """ Put child into this node's children, replacing an existing child that compares equal. """
+        if child in self.children:
+            log.warning(f"Replacing child {child.label}")
+            self.remove_child(child)
+        child.parent = self
+        self.children.append(child)
+
+    def replace_child(self, child: Child, replacement: Child):
+        """ Remove child from this node's children and add replacement. """
+        self.remove_child(child)
+        self.put_child(replacement)
+
+    def remove_child(self, child: Child):
+        """ Remove child from this node's children. A missing child is only logged. """
+        if child not in self.children:
+            log.warning(f"Trying to delete {child.label} but it does not exist.")
+            return  # list.remove() would raise ValueError right after the warning
+        self.children.remove(child)
+
+    @abstractmethod
+    def dot_label(self) -> str:
+        """ Readable label for this node in a dot graph """
+
+    @abstractmethod
+    def dot_id(self) -> str:
+        """ Technical id for this node in a dot graph. Must be unique. """
+
+    @abstractmethod
+    def cut_from(self, label: ByteString) -> ByteString:
+        """ Cut off node's label considered as prefix from label. """
+
+    def to_dot(self) -> str:
+        """ Render this node, the edges to its children and their subtrees as dot statements. """
+        s = f'{self.dot_id()} [label="{self.dot_label()}"]\n'
+        for child in self.children:
+            s += f"{self.dot_id()} -- {child.dot_id()}\n"
+            s += child.to_dot()
+        return s
+
+class Root(Node):
+    """ Entry node of the trie. It has no label and is rendered as "root". """
+
+    def dot_id(self): return "root"
+
+    def dot_label(self): return "root"
+
+    def cut_from(self, label: ByteString) -> ByteString:
+        return label
+
+class Child(Node):
+    """ Trie node with a label fragment and a parent. Children compare equal by label. """
+    def __init__(self, label: ByteString, parent: Node, children: MutableSequence[Child]):
+        super().__init__(children)
+        self.label = label
+        self.parent = parent
+
+    def __eq__(self, other_child):
+        return (isinstance(other_child, Child)
+                and self.label == other_child.label)
+
+    def __hash__(self):
+        # bytearray labels are unhashable; hash their bytes value, which compares equal
+        return hash(bytes(self.label) if isinstance(self.label, bytearray) else self.label)
+
+    def __str__(self):
+        return self.label.decode('utf-8', 'replace').replace('"', '\\"')
+
+    def dot_label(self):
+        return self.label.decode('utf-8', 'replace').replace('"', '\\"')
+
+    def dot_id(self):
+        return id(self)
+
+    def has_label(self, label):
+        return self.label == label
+
+    def is_prefix_of(self, label):
+        return is_prefix_of(self.label, label)
+
+    def replace_with(self, new_child: Child):
+        """ Put new_child in this node's place below this node's parent """
+        new_child.parent = self.parent
+        self.parent.replace_child(self, new_child)
+
+    def starts_with(self, label: ByteString) -> bool:
+        return is_prefix_of(label, self.label)
+
+    def cut_from(self, label: ByteString) -> ByteString:
+        """ Cut node's label from (start of) label """
+        return cut_off_prefix(self.label, label)
+
+    def strip_prefix(self, prefix: ByteString):
+        """ Cut off prefix from node's label """
+        self.label = cut_off_prefix(prefix, self.label)
+
+    def extend(self, label: ByteString) -> ByteString:
+        """ Extend label by node's label """
+        return bytes(label) + bytes(self.label)
+
+    def split_label_at(self, index):
+        return (self.label[:index], self.label[index:])
+
+    def contains(self, label):
+        """ Whether node's label starts with label (same check as starts_with) """
+        return self.starts_with(label)
+
+    def common_prefix(self, label):
+        return common_prefix(self.label, label)
+
+class Terminal(Child):
+    """ Child that marks the end of an inserted label and holds its content. """
+    def __init__(self, label: ByteString, content: Any, parent: Node, children: MutableSequence[Child], multi_value: bool):
+        super().__init__(label, parent, children)
+        self.multi_value = multi_value
+        self.content = [content] if multi_value else content
+
+    @classmethod
+    def from_child(cls, child: Child, content: Any, multi_value: bool):
+        """ Return a new Terminal with child's label, parent and children.
+
+        If child is a multi-value Terminal, its content is kept behind the new
+        content and multi_value is ignored; otherwise only the new content is set. """
+        keep = isinstance(child, Terminal) and child.multi_value
+        t = cls(child.label, content, child.parent, child.children, child.multi_value if keep else multi_value)
+        if keep:
+            t.content.extend(child.content) # add back original content
+        for grandchild in t.children:
+            grandchild.parent = t # the children now hang below t, not below child
+        return t
+
+    def to_dot(self) -> str:
+        return super().to_dot() + f"{self.dot_id()} [color=blue]\n"
+
diff --git a/timezone/timezone.py b/timezone/timezone.py
new file mode 100644
index 0000000..3ea3428
--- /dev/null
+++ b/timezone/timezone.py
@@ -0,0 +1,29 @@
+import csv
+from search.trie import Trie
+
+def load_geonames():
+    """Return a multi-value Trie mapping column 1 (UTF-8 encoded) to column 17 of data/cities500.txt, read as UTF-8, unquoted TSV."""
+    t = Trie(multi_value=True)
+    with open("data/cities500.txt", "r", encoding="utf-8", newline="") as f:
+        for i, row in enumerate(csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)):
+            try:
+                t.insert(row[1].encode("utf-8"), row[17])
+            except Exception:
+                print(f"Error in row {i}")
+                print(f"Label: '{row[1]}'")
+                print(f"Type: {type(row[1])}")
+                raise
+    return t
+
+def check_geonames():
+    """Print index, column 1 and column 17 of each row in data/cities500.txt (UTF-8, unquoted TSV) whose column 1 ends with "lea"."""
+    with open("data/cities500.txt", "r", encoding="utf-8", newline="") as f:
+        for i, row in enumerate(csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)):
+            try:
+                if row[1].endswith("lea"):
+                    print(f"{i}: {row[1]} \t\t\t {row[17]}")
+            except Exception:
+                print(f"Error in row {i}")
+                print(f"Label: '{row[1]}'")
+                print(f"Type: {type(row[1])}")
+                raise