From 4c88eb41217118b44256fa07ab8c4a70c5034162 Mon Sep 17 00:00:00 2001 From: Armin Friedl Date: Mon, 26 Oct 2020 11:47:07 +0100 Subject: [PATCH] Add timezone api --- timezone/__init__.py | 11 + timezone/api.py | 15 ++ timezone/data/search.py | 1 + timezone/data/util/patricia_trie.py | 147 ++++++++++++++ timezone/data/util/preprocessor.py | 32 +++ timezone/data/util/test.py | 7 + timezone/search/__init__.py | 0 timezone/search/trie.py | 300 ++++++++++++++++++++++++++++ timezone/timezone.py | 29 +++ 9 files changed, 542 insertions(+) create mode 100644 timezone/__init__.py create mode 100644 timezone/api.py create mode 100644 timezone/data/search.py create mode 100644 timezone/data/util/patricia_trie.py create mode 100644 timezone/data/util/preprocessor.py create mode 100644 timezone/data/util/test.py create mode 100644 timezone/search/__init__.py create mode 100644 timezone/search/trie.py create mode 100644 timezone/timezone.py diff --git a/timezone/__init__.py b/timezone/__init__.py new file mode 100644 index 0000000..ac918b4 --- /dev/null +++ b/timezone/__init__.py @@ -0,0 +1,11 @@ +""" Timezone management + +Provides: +- `search.py`: Fast prefix search for timezones +- `timezone.py`: Conversion functions +- `api.py`: A REST API for the functionality provided by this package +""" +from flask import Blueprint + +app = Blueprint('timezone', __name__, template_folder='templates') +from . import api diff --git a/timezone/api.py b/timezone/api.py new file mode 100644 index 0000000..593f192 --- /dev/null +++ b/timezone/api.py @@ -0,0 +1,15 @@ +from flask import request, jsonify +from time import time + +import uuid +import struct + +from . 
import logging

log = logging.getLogger(__name__)


class PatriciaTrie:
    """Compressed prefix tree (Patricia / radix trie) over plain strings.

    Every edge fragment is stored in ``Node.elem``; a node whose ``leaf``
    flag is set marks the end of a string that was added with :meth:`add`.
    Siblings never share a first character, so at most one child can match
    any lookup.
    """

    def __init__(self):
        # Give the root its own list; never rely on a shared default.
        self.root = Node(children=[])

    def find(self, prefix, node=None, collector=""):
        """Return all stored strings that start with ``prefix``.

        Args:
            prefix: The (remaining) prefix to look up.
            node: Node to start from; defaults to the root. Used by the
                recursion, callers normally omit it.
            collector: Text already consumed on the way down to ``node``.
                Used by the recursion, callers normally omit it.

        Returns:
            A list of complete strings; empty if nothing matches.
        """
        if node is None:
            node = self.root

        log.debug("Looking for prefix %s at %s", prefix, node.elem)
        if not prefix:
            return self._collect(node, collector)

        for child in node.children:
            if prefix.startswith(child.elem):
                # The whole fragment is consumed, continue below this child.
                return self.find(prefix[len(child.elem):], child,
                                 collector + child.elem)
            if child.elem.startswith(prefix):
                # The prefix ends in the middle of this fragment: everything
                # stored at or below the child is a match.
                return self._collect(child, collector + child.elem)

        return []

    def _collect(self, node, collector):
        """Return the strings ending at ``node`` or anywhere below it."""
        res = []
        if node.leaf:
            log.debug("Found leaf %s", node.elem)
            res.append(collector)
        for child in node.children:
            res.extend(self._collect(child, collector + child.elem))
        log.debug("Result for %s: %s", node.elem, res)
        return res

    def add(self, elem):
        """Insert the string ``elem`` into the trie.

        Adding a string that is already present only (re-)marks its node as
        a leaf.
        """
        node, split_idx, elem_rest = self.find_longest_match(elem, self.root)

        if not split_idx:
            if elem_rest:
                # No child continues the match: hang the rest below ``node``.
                node.children.append(
                    Node(elem=elem_rest, parent=node, leaf=True, children=[]))
            else:
                # ``elem`` ends exactly at an existing node.
                node.leaf = True
            return

        # ``node`` has to be split at ``split_idx``. The split-off tail keeps
        # the old leaf flag and the old children.
        tail = Node(elem=node.elem[split_idx:], parent=node,
                    leaf=node.leaf, children=node.children)
        for grandchild in tail.children:
            grandchild.parent = tail

        node.elem = node.elem[:split_idx]
        node.children = [tail]
        # If nothing of ``elem`` is left, the shortened node itself is the
        # end of ``elem``; otherwise it is a pure branching point.
        node.leaf = not elem_rest
        if elem_rest:
            node.children.append(
                Node(elem=elem_rest, parent=node, leaf=True, children=[]))

    def find_longest_match(self, elem, node):
        """Locate the position in the trie where ``elem`` has to be stored.

        Returns:
            A tuple ``(node, split_idx, elem_rest)``:

            * ``(n, None, None)``: ``elem`` ends exactly at node ``n``.
            * ``(n, None, rest)``: no child of ``n`` matches; ``rest`` has to
              become a new child of ``n``.
            * ``(n, idx, None)``: ``elem`` ends inside ``n.elem`` at ``idx``.
            * ``(n, idx, rest)``: ``elem`` and ``n.elem`` diverge at ``idx``;
              ``rest`` is the unmatched remainder of ``elem``.
        """
        for child in node.children:
            if not child.elem or not elem:
                continue

            # Compare by value: ``is`` is an identity test and is only
            # accidentally true for some cached one-character strings.
            if child.elem[0] != elem[0]:
                continue

            if elem.startswith(child.elem):
                if len(elem) == len(child.elem):
                    # Special case: the node already exists.
                    return (child, None, None)
                # Recurse down the trie with what is left of elem.
                return self.find_longest_match(elem[len(child.elem):], child)

            if child.elem.startswith(elem):
                # elem is shorter than child.elem: split child at len(elem).
                return (child, len(elem), None)

            # Only a part matches; the first differing index is the split.
            for i, (a, b) in enumerate(zip(elem, child.elem)):
                if a != b:
                    return (child, i, elem[i:])

        # No child(-prefix) matched, create another child.
        return (node, None, elem)

    def to_dot(self):
        """Print the trie to stdout as a Graphviz ``graph``."""
        print("graph {")
        self._to_dot(self.root)
        print("}")

    def _to_dot(self, node):
        """Print the edges below ``node``; leaf nodes are coloured blue.

        Nodes are identified by their fragment text, so equal fragments in
        different branches end up as the same node in the drawing.
        """
        for child in node.children:
            if not node.elem:
                print(f'root -- "{child.elem}";')
            else:
                print(f'"{node.elem}" -- "{child.elem}";')

            if child.leaf:
                print(f'"{child.elem}" [color=blue];')

            self._to_dot(child)


class Node:
    """A single trie node: an edge fragment plus optional payload."""

    def __init__(self, elem=None, parent=None, children=None,
                 leaf=False, offset=0, title=None, info=None):
        self.elem = elem
        self.parent = parent
        # A fresh list per node; a ``[]`` default would be shared.
        self.children = [] if children is None else children
        self.leaf = leaf

        # payload
        self.offset = offset
        self.title = title if title else elem
        self.info = info
"""Radix Trie with radix 256

A Radix Trie[1] - once built - allows efficient prefix search. The trie works
on byte strings and hence is oblivious to encoding. The encoding for creation
and search must match. Payload of each node can be an arbitrary object.

Usage
-----
.. code :: python
    t = Trie()
    t.insert(b"Hello", "P1")
    t.insert(b"Hi", "P2")
    t.insert(b"Hela", "P3")
    [(node.content, label) for node, label in t.find(b"He")]
    # [("P3", b"Hela"), ("P1", b"Hello")]

[1] https://en.wikipedia.org/wiki/Radix_tree
"""

from typing import Sequence, MutableSequence, Any, Optional, Union
from abc import ABC, abstractmethod

import logging

log = logging.getLogger(__name__)

# ``typing.ByteString`` is deprecated; this alias keeps the name that the
# annotations in this module use.
ByteString = Union[bytes, bytearray, memoryview]


class Trie:
    """Radix trie that maps byte-string labels to arbitrary content.

    With ``multi_value=True`` every terminal keeps a list of all contents
    inserted under its label; otherwise a later insert replaces the content.
    """

    def __init__(self, multi_value=False):
        self.root = Root([])
        self.multi_value = multi_value

    def insert(self, label: ByteString, content: Any):
        """Store ``content`` under ``label`` and return its Terminal node.

        Raises:
            ValueError: if ``label`` is empty.
        """
        if not label:
            raise ValueError("label must not be empty")

        log.info("Inserting %s into Trie", label)
        start = self.root.child_by_common_prefix(label)
        if start is None:
            log.debug("Creating new terminal for %s at root", label)
            new_node = Terminal(label, content, self.root, [], self.multi_value)
            self.root.put_child(new_node)
            return new_node
        log.debug("Found match %s for %s. Traversing down", start, label)
        return self._insert(start, label, content)

    def _insert(self, node, label, content):
        """Insert ``label`` at or below ``node``.

        ``node`` must share at least its first byte with ``label``. Returns
        the Terminal that now carries ``content``.
        """
        log.info("Inserting %s into Trie at %s", label, node)
        if node.has_label(label):
            log.debug("%s equals %s. Wrapping node as Terminal.", node, label)
            if isinstance(node, Terminal) and not self.multi_value:
                log.warning("%s is already a Terminal. Content will be overwritten.", node)
            terminal = Terminal.from_child(node, content, self.multi_value)
            node.replace_with(terminal)
            return terminal

        if node.is_prefix_of(label):
            log.debug("%s is prefix of %s", node, label)
            cutoff = node.cut_from(label)
            next_node = node.child_by_common_prefix(cutoff)
            if next_node is None:
                log.debug("No matching child found for %s. Creating new child terminal.", cutoff)
                terminal = Terminal(cutoff, content, node, [], self.multi_value)
                node.put_child(terminal)
                return terminal
            log.debug("Found match %s for %s. Traversing down.", next_node, cutoff)
            return self._insert(next_node, cutoff, content)

        if node.starts_with(label):
            log.debug("%s is part of %s. Creating new parent from %s", label, node, label)
            new_node = Terminal(label, content, node.parent, [], self.multi_value)
            node.replace_with(new_node)
            node.strip_prefix(label)
            new_node.put_child(node)
            return new_node

        # Neither is a prefix of the other: branch at the shared prefix.
        log.debug("%s and %s have a common ancestor", label, node)
        shared = bytes(node.common_prefix(label))
        log.debug("Creating new ancestor for %s", shared)
        ancestor = Child(shared, node.parent, [])
        node.replace_with(ancestor)
        terminal = Terminal(cut_off_prefix(shared, label), content, ancestor, [], self.multi_value)
        node.strip_prefix(shared)
        ancestor.put_child(terminal)
        ancestor.put_child(node)
        return terminal

    def find(self, prefix):
        """Return all terminals whose label starts with ``prefix``.

        Returns:
            A list of ``(terminal, label)`` tuples where ``label`` is the
            complete label (bytes) leading to ``terminal``.
        """
        node = self._find(self.root, prefix)
        # Build the label from the matched node, not from the query: the
        # query may end in the middle of the matched node's label.
        return self._get_terminals(node, self._label_of(node))

    def _find(self, node, prefix, collector=""):
        """Return the topmost node covering ``prefix`` below ``node``.

        ``prefix`` still contains ``node``'s own label. Returns None if no
        stored label starts with ``prefix``. ``collector`` is unused.
        """
        cutoff = node.cut_from(prefix)
        log.debug("Searching for %s in %s", cutoff, node)
        child = node.child_by_prefix_match(cutoff)
        if child is not None:
            log.debug("Found node %s in %s for %s. Traversing down.", child, node, cutoff)
            return self._find(child, cutoff)
        if not cutoff:
            return node

        log.debug("Leftover cutoff %s. Trying to find node with prefix %s", cutoff, cutoff)
        child = node.child_by_common_prefix(cutoff)
        if child is None or not child.starts_with(cutoff):
            return None
        log.debug("Found child %s starting with %s", child, cutoff)
        return child

    @staticmethod
    def _label_of(node) -> bytes:
        """Return the full label of ``node`` by walking up to the root."""
        parts = []
        while isinstance(node, Child):
            parts.append(bytes(node.label))
            node = node.parent
        return b"".join(reversed(parts))

    def _get_terminals(self, node, label_builder):
        """Collect ``(terminal, label)`` for ``node`` and all descendants.

        ``label_builder`` is the full label of ``node``.
        """
        if node is None:
            return []

        collector = []
        if isinstance(node, Terminal):
            collector.append((node, label_builder))
        for child in node.children:
            collector.extend(self._get_terminals(child, child.extend(label_builder)))
        return collector

    def to_dot(self) -> str:
        """Return the trie as the source text of a Graphviz ``graph``."""
        return "graph {\n\n" + self.root.to_dot() + "\n}"


def has_common_prefix(label: ByteString, other_label: ByteString) -> bool:
    """ Whether label and other_label have a prefix in common.

    Two labels share a prefix when their first bytes are equal. An empty
    label shares no prefix with anything.
    """
    if not label or not other_label:
        return False
    return label[0] == other_label[0]


def common_prefix(label: ByteString, other_label: ByteString) -> ByteString:
    """ Get the common prefix of label and other_label. """
    buffer = bytearray()
    for (a, b) in zip(label, other_label):
        if a != b:
            break
        buffer.append(a)
    return buffer


def is_prefix_of(prefix: ByteString, label: ByteString) -> bool:
    """ Whether label starts with prefix """
    if len(prefix) > len(label):
        return False
    return all(a == b for (a, b) in zip(prefix, label))


def find_first(predicate, iterable):
    """ Return the first element in iterable that satisfies predicate or None """
    return next(filter(predicate, iterable), None)


def cut_off_prefix(prefix: ByteString, label: ByteString) -> ByteString:
    """ Cut prefix from start of label. Return rest of label.

    Raises ValueError if label does not start with prefix.
    """
    if not is_prefix_of(prefix, label):
        raise ValueError(f"{prefix!r} is not a prefix of {label!r}")
    return bytes(label[len(prefix):])


class Node(ABC):
    """Common behaviour of all trie nodes: managing a list of children."""

    def __init__(self, children: "MutableSequence[Child]"):
        self.children = children

    def child_by_common_prefix(self, label: ByteString) -> "Optional[Child]":
        """ Return Child that has a common prefix with label if one exists. """
        def by_common_prefix(child: "Child"):
            return has_common_prefix(child.label, label)
        return find_first(by_common_prefix, self.children)

    def child_by_prefix_match(self, label: ByteString) -> "Optional[Child]":
        """ Return Child which label is a prefix of the given label if one exists. """
        def by_prefix_match(child: "Child"):
            return is_prefix_of(child.label, label)
        return find_first(by_prefix_match, self.children)

    def put_child(self, child: "Child"):
        """ Put child into this node's children. Replacing existing children. """
        if child in self.children:
            log.warning("Replacing child %s", child.label)
            self.remove_child(child)
        child.parent = self
        self.children.append(child)

    def replace_child(self, child: "Child", replacement: "Child"):
        """ Remove child from this node's children and add replacement. """
        self.remove_child(child)
        self.put_child(replacement)

    def remove_child(self, child: "Child"):
        """ Remove child from this node's children.

        A child that is not present is only logged, not treated as an error.
        """
        if child not in self.children:
            log.warning("Trying to delete %s but it does not exist.", child.label)
            return
        self.children.remove(child)

    @abstractmethod
    def dot_label(self) -> str:
        """ Readable label for this node in a dot graph """
        ...

    @abstractmethod
    def dot_id(self) -> str:
        """ Technical id for this node in a dot graph. Must be unique. """
        ...

    @abstractmethod
    def cut_from(self, label: ByteString) -> ByteString:
        """ Cut off node's label considered as prefix from label. """
        ...

    def to_dot(self) -> str:
        """ Dot statements for this node, its edges and all descendants. """
        parts = [f'{self.dot_id()} [label="{self.dot_label()}"]\n']
        for child in self.children:
            parts.append(f"{self.dot_id()} -- {child.dot_id()}\n")
            parts.append(child.to_dot())
        return "".join(parts)


class Root(Node):
    """The label-less entry node of a trie."""

    def cut_from(self, label: ByteString) -> ByteString:
        # The root has no label, so nothing is cut off.
        return label

    def dot_label(self):
        return "root"

    def dot_id(self):
        return "root"


class Child(Node):
    """Inner node carrying a label fragment but no content."""

    def __init__(self, label: ByteString, parent: Node, children: "MutableSequence[Child]"):
        super().__init__(children)
        self.label = label
        self.parent = parent

    def __eq__(self, other_child):
        # Children are compared by label only.
        return (isinstance(other_child, Child)
                and self.label == other_child.label)

    def __hash__(self):
        # Labels may be bytearrays, which are not hashable themselves.
        return hash(bytes(self.label))

    def __str__(self):
        return self.label.decode('utf-8', 'replace').replace('"', '\\"')

    def dot_label(self):
        return self.label.decode('utf-8', 'replace').replace('"', '\\"')

    def dot_id(self):
        return str(id(self))

    def has_label(self, label):
        """ Whether this node's label equals label """
        return self.label == label

    def is_prefix_of(self, label):
        """ Whether label starts with this node's label """
        return is_prefix_of(self.label, label)

    def replace_with(self, new_child: "Child"):
        """ Put new_child into this node's place below its parent """
        new_child.parent = self.parent
        self.parent.replace_child(self, new_child)

    def starts_with(self, label: ByteString) -> bool:
        """ Whether this node's label starts with label """
        return is_prefix_of(label, self.label)

    def cut_from(self, label: ByteString) -> ByteString:
        """ Cut node's label from (start of) label """
        return cut_off_prefix(self.label, label)

    def strip_prefix(self, prefix: ByteString):
        """ Cut off prefix from node's label """
        self.label = cut_off_prefix(prefix, self.label)

    def extend(self, label: ByteString) -> ByteString:
        """ Extend label by node's label """
        return bytes(label) + bytes(self.label)

    def split_label_at(self, index):
        """ Return the node's label split into two parts at index """
        return (self.label[:index], self.label[index:])

    def contains(self, label):
        """ Whether this node's label starts with label """
        return is_prefix_of(label, self.label)

    def common_prefix(self, label):
        """ Common prefix of this node's label and label """
        return common_prefix(self.label, label)


class Terminal(Child):
    """Node that marks the end of an inserted label and holds its content."""

    def __init__(self, label: ByteString, content: Any, parent: Node,
                 children: "MutableSequence[Child]", multi_value: bool):
        super().__init__(label, parent, children)
        self.multi_value = multi_value
        self.content = [content] if multi_value else content

    @classmethod
    def from_child(cls, child: Child, content: Any, multi_value: bool):
        """ Create a Terminal that takes over child's label and children.

        multi_value has no effect if child already is a multi-value
        Terminal: the new Terminal then stays multi-value and keeps the old
        contents after the new one.
        """
        if isinstance(child, Terminal) and child.multi_value:
            # A new instance is not strictly needed here, but it keeps the
            # behaviour compatible with the non-multi-value case.
            terminal = cls(child.label, content, child.parent, child.children, child.multi_value)
            terminal.content.extend(child.content)  # add back original content
        else:
            terminal = cls(child.label, content, child.parent, child.children, multi_value)

        # The children list is taken over, so the children have to point to
        # the new node.
        for grandchild in terminal.children:
            grandchild.parent = terminal
        return terminal

    def to_dot(self) -> str:
        s = super().to_dot()
        s += f"{self.dot_id()} [color=blue]\n"
        return s