Add timezone api

2020-10-26 11:47:07 +01:00 · 2020-10-26 11:47:07 +01:00 · 4c88eb4121
commit 4c88eb4121
parent b410540f7f
9 changed files with 542 additions and 0 deletions
--- a/timezone/init.py
+++ b/timezone/init.py
@ -0,0 +1,11 @@
 """ Timezone management
 Provides:
 - `search.py`: Fast prefix search for timezones
 - `timezone.py`: Conversion functions
 - `api.py`: A REST API for the functionality provided by this package
 """
 from flask import Blueprint
 # Blueprint of this package; the route handlers in `api` attach to it via `@app.route`.
 app = Blueprint('timezone', __name__, template_folder='templates')
 # Imported only after `app` exists because `api` itself does `from . import app`;
 # importing it is what registers its routes on the blueprint.
 from . import api
--- a/timezone/api.py
+++ b/timezone/api.py
@ -0,0 +1,15 @@
 from flask import request, jsonify
 from time import time
 import uuid
 import struct
 from . import app
@app.route('/api/v1/autocomplete', methods=['GET'])
 def autocomplete_timezone():
    complete_str = request.args.get('complete')
    if not complete_str: return "No part"
    return complete_str
--- a/timezone/data/search.py
+++ b/timezone/data/search.py
@ -0,0 +1 @@
 def suggest(prefix):
--- a/timezone/data/util/patricia_trie.py
+++ b/timezone/data/util/patricia_trie.py
@ -0,0 +1,147 @@
 import logging
 class PatriciaTrie:
    """Compressed prefix tree (Patricia/radix trie) over strings.

    Each node stores the label of the edge leading to it in ``elem``. A node
    with ``leaf=True`` marks the end of a string that was added to the trie.
    """
    def __init__(self):
        # Pass a fresh list explicitly so the root never shares its children
        # list with any other node.
        self.root = Node(children=[])
    def find(self, prefix, node=None, collector=""):
        """Return every stored string that starts with ``prefix``.

        ``node`` and ``collector`` are recursion state (the node currently
        searched and the text consumed on the way down to it); callers
        normally pass only ``prefix``. Returns an empty list if nothing matches.
        """
        if node is None:
            node = self.root
        logging.debug("Looking for prefix %s at %s", prefix, node.elem)
        if not prefix:
            # Prefix exhausted: this node (if a leaf) and its whole subtree match.
            res = [collector] if node.leaf else []
            for child in node.children:
                res.extend(self.find("", child, collector + child.elem))
            logging.debug("Result for %s: %s", node.elem, res)
            return res
        for child in node.children:
            if prefix.startswith(child.elem):
                # The whole edge is consumed; continue below with the remainder.
                return self.find(prefix[len(child.elem):], child, collector + child.elem)
            if child.elem.startswith(prefix):
                # The prefix ends in the middle of this edge, so the complete
                # subtree below the child matches.
                return self.find("", child, collector + child.elem)
        return []
    def add(self, elem):
        """Add the string ``elem`` to the trie, splitting an existing node if needed."""
        (node, split_idx, elem_rest) = self.find_longest_match(elem, self.root)
        if not split_idx:
            if not elem_rest:
                # elem is already spelled out by the path to node:
                # just make sure it is marked as a stored string.
                node.leaf = True
            else:
                # Nothing below node shares a prefix with the remainder:
                # hang the remainder below node as a new leaf.
                node.children.append(Node(elem=elem_rest, parent=node, leaf=True, children=[]))
            return
        # node has to be split at split_idx. The tail of its label moves into a
        # new child that inherits the old leaf flag and the old children.
        tail = Node(elem=node.elem[split_idx:], parent=node, leaf=node.leaf, children=node.children)
        for grandchild in tail.children:
            grandchild.parent = tail
        node.elem = node.elem[:split_idx]
        node.children = [tail]
        if elem_rest:
            # elem diverges from the old label: the shortened node is a pure
            # branch point and the remainder becomes a second child.
            node.leaf = False
            node.children.append(Node(elem=elem_rest, parent=node, leaf=True, children=[]))
        else:
            # elem ends exactly at the split point.
            node.leaf = True
    def find_longest_match(self, elem, node):
        """Locate where ``elem`` has to be inserted below ``node``.

        Returns a tuple ``(node, split_idx, elem_rest)``:

        - ``(child, None, None)``: elem is spelled out exactly by child
        - ``(child, n, None)``: elem ends inside child's label at index n
        - ``(child, i, rest)``: elem and child's label diverge at index i
        - ``(node, None, rest)``: no child of node shares a prefix with rest
        """
        for child in node.children:
            if not child.elem or not elem: continue
            # child does not match; compare by value, identity is not reliable for strings
            if child.elem[0] != elem[0]: continue
            # child matches completely
            if elem.startswith(child.elem):
                # special case: the node already exists
                if len(elem) == len(child.elem):
                    return (child, None, None)
                # recurse down the trie
                return self.find_longest_match(elem[len(child.elem):], child)
            # elem matches completely, which implies that elem is shorter
            # than child.elem. Split child at len(elem)
            if child.elem.startswith(elem):
                return (child, len(elem), None)
            # Neither is a prefix of the other but the first char matches,
            # so the labels diverge somewhere inside their common length.
            for i, (a, b) in enumerate(zip(elem, child.elem)):
                if a != b:
                    return (child, i, elem[i:])
        # No child(-prefix) matched, create another child
        return (node, None, elem)
    def to_dot(self):
        """Print the trie as a Graphviz ``graph`` to standard output."""
        print("graph {")
        self._to_dot(self.root)
        print("}")
    def _to_dot(self, node):
        """Print the edges below ``node`` recursively; leaf nodes are coloured blue."""
        for child in node.children:
            if not node.elem: print(f'root -- "{child.elem}";')
            else: print(f'"{node.elem}" -- "{child.elem}";')
            if child.leaf:
                print(f'"{child.elem}" [color=blue];')
            self._to_dot(child)
 class Node:
    """Node of a PatriciaTrie.

    ``elem`` is the label of the edge leading to this node, ``leaf`` tells
    whether a stored string ends here. ``offset``, ``title`` and ``info``
    are payload; ``title`` falls back to ``elem`` when not given.
    """
    def __init__(self, elem=None, parent=None, children=None,
                 leaf=False, offset=0, title=None, info=None):
        self.elem = elem
        self.parent = parent
        # A fresh list per node: a mutable default argument would be shared
        # by every node that is created without explicit children.
        self.children = [] if children is None else children
        self.leaf = leaf
        # payload
        self.offset = offset
        self.title = title if title else elem
        self.info = info
--- a/timezone/data/util/preprocessor.py
+++ b/timezone/data/util/preprocessor.py
@ -0,0 +1,32 @@
 # Create Patricia Tries from various datasets
 #
 # Each Trie leaf has a timezone assigned which
 # may be a fixed UTC offset or a tz timezone
 import csv
 from patricia_trie import PatriciaTrie
 # Geonames from
 # http://download.geonames.org/export/dump/allCountries.zip
 def geonames_allcountries(path):
    """Build a PatriciaTrie from the tab-separated GeoNames dump at ``path``.

    Only rows whose column 6 equals "P" are used (presumably the feature
    class for populated places -- verify against the GeoNames readme). The
    value of column 2 is added to the trie with all spaces removed.
    Rows with fewer than seven columns are skipped.
    """
    patricia = PatriciaTrie()
    # The dump is plain tab-separated text without quoting: decode it as UTF-8
    # independent of the platform default and do not let the csv module give
    # '"' inside a name any special meaning.
    with open(path, encoding="utf-8", newline="") as all_countries_csv:
        reader = csv.reader(all_countries_csv, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if len(row) <= 6 or row[6] != "P":
                continue
            patricia.add(row[2].replace(' ', ''))
    return patricia
 # Timezone abbreviations from
 # https://www.timeanddate.com/time/zones/
 def timezone_abbreviations():
    """Placeholder without implementation; always returns None."""
    return None
 # Timezones from
 # https://www.iana.org/time-zones
 def tz_zones():
    """Placeholder without implementation; always returns None."""
    return None
--- a/timezone/data/util/test.py
+++ b/timezone/data/util/test.py
@ -0,0 +1,7 @@
 import contextlib
 import preprocessor
 import sys
 # Build the trie from a GeoNames sample and dump it as a Graphviz file.
 p = preprocessor.geonames_allcountries("/home/armin/Downloads/allCountries/allCountries10000.txt")
 # to_dot() prints to stdout. redirect_stdout restores sys.stdout afterwards,
 # whereas binding the file with `as sys.stdout` would leave sys.stdout
 # pointing at a closed file once the block is left.
 with open("/home/armin/Desktop/test.dot", "w", encoding="utf-8") as dot_file, \
         contextlib.redirect_stdout(dot_file):
    p.to_dot()
--- a/timezone/search/init.py
+++ b/timezone/search/init.py
--- a/timezone/search/trie.py
+++ b/timezone/search/trie.py
@ -0,0 +1,300 @@
 """Radix Trie with radix 256
 A Radix Trie[1] - once built - allows efficient prefix search. The trie works
 on byte strings and hence is oblivious to encoding. The encoding for creation
 and search must match. Payload of each node can be an arbitrary object.
 Usage
 -----
 .. code :: python
    t = Trie()
    t.insert(b"Hello", "P1")
    t.insert(b"Hi", "P2")
    t.insert(b"Hela", "P3")
    t.find(b"He") # [(terminal, label), ...] for the "Hello" and "Hela" entries; the payload is terminal.content
 [1] https://en.wikipedia.org/wiki/Radix_tree
 """
 from __future__ import annotations
 from typing import Sequence, MutableSequence, ByteString, Any, Optional
 from abc import ABC, abstractmethod
 import logging
 log = logging.getLogger(__name__)
 class Trie:
    """Radix trie keyed by byte strings with an arbitrary payload per entry.

    With ``multi_value=True`` a Terminal collects the contents of all inserts
    for the same label in a list; otherwise a later insert for the same label
    overwrites the earlier content.
    """
    def __init__(self, multi_value=False):
        self.root = Root([])
        self.multi_value = multi_value
    def insert(self, label: ByteString, content: Any):
        """Insert ``content`` under ``label`` and return the Terminal that holds it."""
        log.info(f"Inserting {label} into Trie")
        start = self.root.child_by_common_prefix(label)
        if not start:
            log.debug(f"Creating new terminal for {label} at root")
            new_node = Terminal(label, content, self.root, [], self.multi_value)
            self.root.put_child(new_node)
            return new_node
        log.debug(f"Found match {start} for {label}. Traversing down")
        # Hand the terminal back on this path as well, like the branch above does.
        return self._insert(start, label, content)
    def _insert(self, node, label, content):
        """Insert ``content`` under ``label`` at or below ``node``.

        ``node`` must share at least its first byte with ``label``. Returns
        the Terminal that holds the content.
        """
        log.info(f"Inserting {label} into Trie at {node}")
        if node.has_label(label):
            log.debug(f"{node} equals {label}. Wrapping node as Terminal.")
            if isinstance(node, Terminal) and not self.multi_value:
                log.warning(f"{node} is already a Terminal. Content will be overwritten.")
            terminal = Terminal.from_child(node, content, self.multi_value)
            node.replace_with(terminal)
            # The terminal took over node's children list; make the children
            # point at it instead of at the node that was just detached.
            for grandchild in terminal.children:
                grandchild.parent = terminal
            return terminal
        if node.is_prefix_of(label):
            log.debug(f"{node} is prefix of {label}")
            cutoff = node.cut_from(label)
            next_node = node.child_by_common_prefix(cutoff)
            if not next_node:
                log.debug(f"No matching child found for {cutoff}. Creating new child terminal.")
                terminal = Terminal(cutoff, content, node, [], self.multi_value)
                node.put_child(terminal)
                return terminal
            else:
                log.debug(f"Found match {next_node} for {cutoff}. Traversing down.")
                return self._insert(next_node, cutoff, content)
        if node.starts_with(label):
            log.debug(f"{label} is part of {node}. Creating new parent from {label}")
            new_node = Terminal(label, content, node.parent, [], self.multi_value)
            node.replace_with(new_node)
            node.strip_prefix(label)
            new_node.put_child(node)
            return new_node
        # Neither label is a prefix of the other: branch at their common prefix.
        log.debug(f"{label} and {node} have a common ancestor")
        common_prefix = node.common_prefix(label)
        log.debug(f"Creating new ancestor for {common_prefix}")
        ancestor = Child(common_prefix, node.parent, [])
        node.replace_with(ancestor)
        terminal = Terminal(cut_off_prefix(common_prefix, label), content, ancestor, [], self.multi_value)
        node.strip_prefix(common_prefix)
        ancestor.put_child(terminal)
        ancestor.put_child(node)
        return terminal
    def find(self, prefix):
        """Return ``(terminal, full_label)`` tuples for all entries starting with ``prefix``.

        Returns an empty list if no entry matches.
        """
        node = self._find(self.root, prefix)
        if node is None:
            return []
        # The matching node's label may extend beyond prefix (prefix ending in
        # the middle of a node), so labels are rebuilt from the node's real
        # position in the trie rather than from the search prefix.
        return self._get_terminals(node, self._full_label(node))
    def _full_label(self, node):
        """Concatenate the labels on the path from the root down to ``node``."""
        parts = []
        # The root is the only node without a label; stop there.
        while getattr(node, "label", None) is not None:
            parts.append(bytes(node.label))
            node = node.parent
        return b"".join(reversed(parts))
    def _find(self, node, prefix, collector=""):
        """Return the topmost node whose subtree holds all matches for ``prefix``, or None.

        ``prefix`` is relative to the parent of ``node``: the node's own label
        is cut off before its children are searched. ``collector`` is unused.
        """
        cutoff = node.cut_from(prefix)
        log.debug(f"Searching for {cutoff} in {node}")
        child = node.child_by_prefix_match(cutoff)
        if not child and not cutoff:
            return node
        elif not child and cutoff:
            log.debug(f"Leftover cutoff {cutoff}. Trying to find node with prefix {cutoff}")
            child = node.child_by_common_prefix(cutoff)
            if not child or not child.starts_with(cutoff):
                return None
            log.debug(f"Found child {child} starting with {cutoff}")
            return child
        else: # child must be not None
            log.debug(f"Found node {child} in {node} for {cutoff}. Traversing down.")
            return self._find(child, cutoff)
    def _get_terminals(self, node, label_builder):
        """Collect ``(terminal, label)`` tuples for ``node`` and everything below it.

        ``label_builder`` is the full label of ``node``; each child's label is
        appended to it on the way down.
        """
        if not node: return []
        collector = []
        if isinstance(node, Terminal):
            collector.append((node, label_builder))
        for child in node.children:
            l = child.extend(label_builder)
            collector.extend(self._get_terminals(child, l))
        return collector
    def to_dot(self) -> str:
        """Return the trie as the source text of a Graphviz ``graph``."""
        return "graph {\n\n"+self.root.to_dot()+"\n}"
 def has_common_prefix(label: ByteString, other_label: ByteString) -> bool:
    """ Tell whether both labels start with the same element. Both must be non-empty. """
    assert label and other_label
    first, other_first = label[0], other_label[0]
    return first == other_first
 def common_prefix(label: ByteString, other_label: ByteString) -> ByteString:
    """ Get the common prefix of label and other_label.

    The result is an immutable ``bytes`` object (empty if the labels share no
    prefix), so it can be used as a node label that is hashed or compared
    without the risk of being mutated later.
    """
    length = 0
    for (a, b) in zip(label, other_label):
        if a != b: break
        length += 1
    return bytes(label[:length])
 def is_prefix_of(prefix: ByteString, label: ByteString) -> bool:
    """ Tell whether label begins with prefix. An empty prefix matches every label. """
    fits = len(prefix) <= len(label)
    return fits and all(a == b for (a, b) in zip(prefix, label))
 def find_first(predicate, iterable):
    """ Give back the first item of iterable for which predicate holds, None if there is none """
    matches = filter(predicate, iterable)
    return next(matches, None)
 def cut_off_prefix(prefix: ByteString, label: ByteString) -> ByteString:
    """ Strip prefix from the front of label and return the remainder as bytes.

    label has to start with prefix; this is checked with an assertion.
    """
    assert is_prefix_of(prefix, label)
    remainder = label[len(prefix):]
    return bytes(remainder)
 class Node(ABC):
    """ Base class of all trie nodes: owns the list of children and the dot export. """
    def __init__(self, children: MutableSequence[Child]):
        self.children = children
    def child_by_common_prefix(self, label: ByteString) -> Optional[Child]:
        """ Return Child that has a common prefix with label if one exists. """
        def by_common_prefix(child: Child):
            return has_common_prefix(child.label, label)
        return find_first(by_common_prefix, self.children)
    def child_by_prefix_match(self, label: ByteString) -> Optional[Child]:
        """ Return Child whose label is a prefix of the given label if one exists. """
        def by_prefix_match(child: Child):
            return is_prefix_of(child.label, label)
        return find_first(by_prefix_match, self.children)
    def put_child(self, child: Child):
        """ Put child into this node's children, replacing an existing equal child. """
        if child in self.children:
            log.warning(f"Replacing child {child.label}")
            self.remove_child(child)
        child.parent = self
        self.children.append(child)
    def replace_child(self, child: Child, replacement: Child):
        """ Remove child from this node's children and add replacement. """
        self.remove_child(child)
        self.put_child(replacement)
    def remove_child(self, child: Child):
        """ Remove child from this node's children. A missing child is only logged. """
        if child not in self.children:
            log.warning(f"Trying to delete {child.label} but it does not exist.")
            # Nothing to remove; list.remove would raise ValueError right
            # after the warning above.
            return
        self.children.remove(child)
    @abstractmethod
    def dot_label(self) -> str:
        """ Readable label for this node in a dot graph """
        ...
    @abstractmethod
    def dot_id(self) -> str:
        """ Technical id for this node in a dot graph. Must be unique. """
        ...
    @abstractmethod
    def cut_from(self, label: ByteString) -> ByteString:
        """ Cut off node's label considered as prefix from label. """
        ...
    def to_dot(self) -> str:
        """ Return dot statements for this node, its edges and, recursively, its children. """
        s = f'{self.dot_id()} [label="{self.dot_label()}"]\n'
        for child in self.children:
            s += f"{self.dot_id()} -- {child.dot_id()}\n"
            s += child.to_dot()
        return s
 class Root(Node):
    """ Label-less entry node of the trie. """
    def dot_id(self):
        """ Fixed dot id of the root node. """
        return "root"
    def dot_label(self):
        """ Fixed dot label of the root node. """
        return "root"
    def cut_from(self, label: ByteString) -> ByteString:
        """ The root has no label of its own, so label is handed back unchanged. """
        return label
 class Child(Node):
    """ Trie node carrying a label and a link to its parent.

    Two children compare equal (and hash alike) when their labels are equal.
    """
    def __init__(self, label: ByteString, parent: Node, children: MutableSequence[Child]):
        super().__init__(children)
        self.label = label
        self.parent = parent
    def __eq__(self, other_child):
        if not isinstance(other_child, Child):
            return False
        return self.label == other_child.label
    def __hash__(self):
        return hash(self.label)
    def __str__(self):
        text = self.label.decode('utf-8', 'replace')
        return text.replace('"', '\\"')
    def dot_label(self):
        """ Label decoded as UTF-8 with double quotes escaped for dot. """
        text = self.label.decode('utf-8', 'replace')
        return text.replace('"', '\\"')
    def dot_id(self):
        """ Object identity serves as id in the dot graph. """
        return id(self)
    def has_label(self, label):
        """ Whether the node's label equals label. """
        return label == self.label
    def is_prefix_of(self, label):
        """ Whether label starts with the node's label. """
        return is_prefix_of(self.label, label)
    def replace_with(self, new_child: Child):
        """ Put new_child into this node's place below its parent. """
        parent = self.parent
        new_child.parent = parent
        parent.replace_child(self, new_child)
    def starts_with(self, label: ByteString) -> bool:
        """ Whether the node's label starts with label. """
        return is_prefix_of(label, self.label)
    def cut_from(self, label: ByteString) -> ByteString:
        """ Remove the node's label from the front of label """
        return cut_off_prefix(self.label, label)
    def strip_prefix(self, prefix: ByteString):
        """ Shorten the node's own label by removing prefix from its front """
        shortened = cut_off_prefix(prefix, self.label)
        self.label = shortened
    def extend(self, label: ByteString) -> ByteString:
        """ Append the node's label to label and return the result as bytes """
        head, tail = bytes(label), bytes(self.label)
        return head + tail
    def split_label_at(self, index):
        """ Split the node's label into the parts before and from index. """
        own = self.label
        return (own[:index], own[index:])
    def contains(self, label):
        """ Whether label is no longer than the node's label and matches its beginning. """
        if len(self.label) < len(label):
            return False
        return all(a == b for (a, b) in zip(self.label, label))
    def common_prefix(self, label):
        """ Common prefix of the node's label and label. """
        return common_prefix(self.label, label)
 class Terminal(Child):
    """ Child that marks the end of an inserted label and carries its content.

    In multi-value mode ``content`` is a list of all values, otherwise the single value.
    """
    def __init__(self, label: ByteString, content: Any, parent: Node, children: MutableSequence[Child], multi_value: bool):
        super().__init__(label, parent, children)
        self.multi_value = multi_value
        if multi_value:
            self.content = [content]
        else:
            self.content = content
    @classmethod
    def from_child(cls, child: Child, content: Any, multi_value: bool):
        """ Create a Terminal with child's label, parent and children that holds content.

        If child is already a multi-value Terminal its own multi-value state
        wins over the multi_value argument and its existing values are kept
        behind the new one.
        """
        keeps_values = isinstance(child, Terminal) and child.multi_value
        if not keeps_values:
            return cls(child.label, content, child.parent, child.children, multi_value)
        # A new instance is built here as well so that both modes behave alike.
        merged = cls(child.label, content, child.parent, child.children, child.multi_value)
        merged.content.extend(child.content) # carry over the previous values
        return merged
    def to_dot(self) -> str:
        """ Dot statements of the node plus a blue colour marker for terminals. """
        return super().to_dot() + f"{self.dot_id()} [color=blue]\n"
--- a/timezone/timezone.py
+++ b/timezone/timezone.py
@ -0,0 +1,29 @@
 import csv
 from search.trie import Trie
 def load_geonames(path="data/cities500.txt"):
    """Load a tab-separated GeoNames file into a multi-value Trie.

    For every row, column 1 (UTF-8 encoded) is the label and column 17 the
    content (presumably place name and timezone id -- verify against the
    GeoNames readme). ``path`` defaults to the previously hard-coded
    location, which is relative to the working directory.

    Any error while processing a row is reported on stdout and re-raised.
    """
    t = Trie(multi_value=True)
    # The dump is plain tab-separated text without quoting: decode it as UTF-8
    # independent of the platform default and do not let the csv module give
    # '"' inside a name any special meaning.
    with open(path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for i, row in enumerate(reader):
            try:
                t.insert(row[1].encode("utf-8"), row[17])
            except Exception:
                print(f"Error in row {i}")
                # A short row has no label to show; indexing it here would
                # replace the original error by an IndexError.
                if len(row) > 1:
                    print(f"Label: '{row[1]}'")
                    print(f"Type: {type(row[1])}")
                raise
    return t
 def check_geonames(path="data/cities500.txt"):
    """Print index, column 1 and column 17 of every row whose column 1 ends with "lea".

    Debugging aid for the tab-separated GeoNames file at ``path``, which
    defaults to the previously hard-coded location relative to the working
    directory. Any error while processing a row is reported on stdout and
    re-raised.
    """
    # Same reader settings as for loading: UTF-8 text, no quote processing.
    with open(path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for i, row in enumerate(reader):
            try:
                if row[1].endswith("lea"):
                    print(f"{i}: {row[1]} \t\t\t {row[17]}")
            except Exception:
                print(f"Error in row {i}")
                # A short row has no label to show; indexing it here would
                # replace the original error by an IndexError.
                if len(row) > 1:
                    print(f"Label: '{row[1]}'")
                    print(f"Type: {type(row[1])}")
                raise