Add timezone api
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
Armin Friedl 2020-10-26 11:47:07 +01:00
parent b410540f7f
commit 4c88eb4121
Signed by: armin
GPG key ID: 48C726EEE7FBCBC8
9 changed files with 542 additions and 0 deletions

11
timezone/__init__.py Normal file
View file

@ -0,0 +1,11 @@
""" Timezone management
Provides:
- `search.py`: Fast prefix search for timezones
- `timezone.py`: Conversion functions
- `api.py`: A REST API for the functionality provided by this package
"""
from flask import Blueprint

# Blueprint on which this package's route handlers are registered.
app = Blueprint('timezone', __name__, template_folder='templates')

# Imported last on purpose: the `api` module does `from . import app`, so
# `app` has to be bound before `api` is loaded. Loading `api` executes its
# `@app.route` decorators, which is what attaches the endpoints.
from . import api

15
timezone/api.py Normal file
View file

@ -0,0 +1,15 @@
from flask import request, jsonify
from time import time
import uuid
import struct
from . import app
@app.route('/api/v1/autocomplete', methods=['GET'])
def autocomplete_timezone():
    """Echo the ``complete`` query parameter of the current request.

    Returns:
        A ``(body, status, headers)`` tuple. The body is the value of the
        ``complete`` query parameter, or the literal text ``"No part"`` when
        the parameter is missing or empty.
    """
    complete_str = request.args.get('complete')
    body = complete_str if complete_str else "No part"
    # The body is user-controlled. A bare string would be sent with Flask's
    # default text/html content type and could be interpreted as markup by
    # a browser, so declare it as plain text explicitly.
    return body, 200, {"Content-Type": "text/plain; charset=utf-8"}

1
timezone/data/search.py Normal file
View file

@ -0,0 +1 @@
def suggest(prefix):
    """Placeholder for prefix-based suggestions; not implemented yet.

    Args:
        prefix: The text to complete. Currently unused.

    Raises:
        NotImplementedError: Always. The function has no implementation;
            raising keeps the module importable while making accidental
            calls fail loudly.
    """
    raise NotImplementedError("suggest() is not implemented yet")

View file

@ -0,0 +1,147 @@
import logging
class PatriciaTrie:
    """Compressed prefix tree over strings.

    Each node stores a piece of text in ``elem``; concatenating the ``elem``
    values along a path from the root yields a stored key. A node with
    ``leaf`` set marks the end of a key that was added.
    """

    def __init__(self):
        # Hand the root its own list so it can never share a children list
        # with any other node.
        self.root = Node(children=[])

    def find(self, prefix, node=None, collector=""):
        """Return every stored key that starts with `prefix`.

        Args:
            prefix: Text the keys must start with. An empty prefix returns
                all keys below `node`.
            node: Node to search from; recursion state. Defaults to the root.
            collector: Text accumulated on the way down to `node`; recursion
                state.

        Returns:
            A list of the matching keys (possibly empty).
        """
        if not node:
            return self.find(prefix, self.root)

        logging.debug(f"Looking for prefix {prefix} at {node.elem}")

        if not prefix:
            # Prefix exhausted: everything below this node matches.
            res = []
            if node.leaf:
                logging.debug(f"Found leaf {node.elem}")
                res.append(collector)
            for child in node.children:
                logging.debug(f"Looking for leafs in {node.elem}")
                res.extend(self.find(prefix, child, collector + child.elem))
            logging.debug(f"Result for {node.elem}: {res}")
            return res

        for child in node.children:
            if prefix.startswith(child.elem):
                # Child consumed completely, continue with the remainder.
                return self.find(prefix[len(child.elem):], child,
                                 collector + child.elem)
            if child.elem.startswith(prefix):
                # The prefix ends in the middle of this child's text, so the
                # whole subtree of the child matches.
                return self.find("", child, collector + child.elem)

        return []

    def add(self, elem):
        """Insert `elem` into the trie, splitting nodes where necessary."""
        node, split_idx, elem_rest = self.find_longest_match(elem, self.root)

        if not split_idx:
            if elem_rest:
                # Nothing below `node` matches the rest: hang it in as a new
                # child (also the case when `node` is the root).
                node.children.append(
                    Node(elem=elem_rest, parent=node, leaf=True, children=[]))
            else:
                # elem is already represented by `node`; make sure it counts
                # as a stored key.
                node.leaf = True
            return

        # elem diverges from (or ends inside) node.elem at split_idx. When
        # nothing of elem is left, the shortened node itself is the new key.
        self._split(node, split_idx, leaf=not elem_rest)
        if elem_rest:
            node.children.append(
                Node(elem=elem_rest, parent=node, leaf=True, children=[]))

    @staticmethod
    def _split(node, split_idx, leaf):
        """Cut `node` at `split_idx`, moving the tail into a single child.

        The tail node inherits the old leaf flag and the old children;
        `node` keeps the head of the text and gets `leaf` as its new flag.
        """
        tail = Node(elem=node.elem[split_idx:], parent=node,
                    leaf=node.leaf, children=node.children)
        # The old children now hang below the tail node.
        for grandchild in tail.children:
            grandchild.parent = tail
        node.elem = node.elem[:split_idx]
        node.leaf = leaf
        node.children = [tail]

    def find_longest_match(self, elem, node):
        """Locate the position at which `elem` has to be inserted.

        Returns:
            A tuple ``(node, split_idx, elem_rest)``:

            - ``(child, None, None)``: a node for `elem` already exists.
            - ``(child, n, None)``: `elem` ends after `n` characters of
              ``child.elem``; the child must be split there.
            - ``(child, i, rest)``: `elem` and ``child.elem`` diverge at
              index `i`; `rest` is the unmatched part of `elem`.
            - ``(node, None, rest)``: no child of `node` matches; `rest`
              becomes a new child of `node`.
        """
        for child in node.children:
            if not child.elem or not elem:
                continue

            # Compare by value; identity of one-character strings is an
            # interpreter detail and not reliable.
            if child.elem[0] != elem[0]:
                continue

            if elem.startswith(child.elem):
                if len(elem) == len(child.elem):
                    # Special case: the node already exists.
                    return (child, None, None)
                # Child matched completely, descend with the remainder.
                return self.find_longest_match(elem[len(child.elem):], child)

            if child.elem.startswith(elem):
                # elem is shorter than the child's text: split at len(elem).
                return (child, len(elem), None)

            # Neither is a prefix of the other, so the two differ somewhere
            # within the shorter one; split at the first difference.
            split_idx = next(i for i, (a, b) in enumerate(zip(elem, child.elem))
                             if a != b)
            return (child, split_idx, elem[split_idx:])

        # No child shares a prefix with elem.
        return (node, None, elem)

    def to_dot(self):
        """Print the trie to stdout as a dot graph."""
        print("graph {")
        self._to_dot(self.root)
        print("}")

    def _to_dot(self, node):
        """Print the edges below `node`; stored keys are coloured blue."""
        for child in node.children:
            if not node.elem:
                print(f'root -- "{child.elem}";')
            else:
                print(f'"{node.elem}" -- "{child.elem}";')
            if child.leaf:
                print(f'"{child.elem}" [color=blue];')
            self._to_dot(child)
class Node:
    """A single node of the trie.

    Attributes are plain data: ``elem`` is the text stored at this node,
    ``parent`` and ``children`` link the tree, ``leaf`` flags the node as
    the end of a stored key. ``offset``, ``title`` and ``info`` are payload
    fields; ``title`` falls back to ``elem`` when not given.
    """

    def __init__(self, elem=None, parent=None, children=None,
                 leaf=False, offset=0, title=None, info=None):
        self.elem = elem
        self.parent = parent
        # A list literal as default would be created once and shared by
        # every node constructed without an explicit `children` argument.
        self.children = [] if children is None else children
        self.leaf = leaf

        # payload
        self.offset = offset
        self.title = title if title else elem
        self.info = info

View file

@ -0,0 +1,32 @@
# Create Patricia Tries from various datasets
#
# Each Trie leaf has a timezone assigned which
# may be a fixed UTC offset or a tz timezone
import csv
from patricia_trie import PatriciaTrie
# Geonames from
# http://download.geonames.org/export/dump/allCountries.zip
def geonames_allcountries(path):
    """Build a PatriciaTrie from a tab-separated GeoNames dump.

    Only rows whose seventh column equals ``"P"`` are used (presumably the
    GeoNames feature class for populated places — verify against the dump's
    readme). The third column, with spaces removed, is added to the trie.

    Args:
        path: Path of the tab-separated dump file.

    Returns:
        The populated PatriciaTrie.
    """
    patricia = PatriciaTrie()

    # Read as UTF-8 explicitly instead of the platform default, and pass
    # newline='' as the csv module asks for.
    with open(path, encoding="utf-8", newline="") as all_countries_csv:
        # The dump is plain tab-separated text without quoting; QUOTE_NONE
        # keeps a stray '"' in a name from swallowing following fields.
        reader = csv.reader(all_countries_csv, delimiter='\t',
                            quoting=csv.QUOTE_NONE)
        for row in reader:
            # Blank or truncated lines have no seventh column.
            if len(row) < 7 or row[6] != "P":
                continue
            place_clean = row[2].replace(' ', '')
            patricia.add(place_clean)

    return patricia
# Timezone abbreviations from
# https://www.timeanddate.com/time/zones/
def timezone_abbreviations():
    """Placeholder without implementation; always returns None."""
    return None
# Timezones from
# https://www.iana.org/time-zones
def tz_zones():
    """Placeholder without implementation; always returns None."""
    return None

View file

@ -0,0 +1,7 @@
import contextlib
import sys

import preprocessor

# Optional command line overrides: <input dump> <output dot file>.
# Without arguments the previous hard-coded locations are used.
source_path = (sys.argv[1] if len(sys.argv) > 1
               else "/home/armin/Downloads/allCountries/allCountries10000.txt")
target_path = (sys.argv[2] if len(sys.argv) > 2
               else "/home/armin/Desktop/test.dot")

p = preprocessor.geonames_allcountries(source_path)

# to_dot() prints to stdout. redirect_stdout restores the real stdout when
# the block ends, instead of leaving sys.stdout bound to a closed file.
with open(target_path, "w", encoding="utf-8") as dot_file, \
        contextlib.redirect_stdout(dot_file):
    p.to_dot()

View file

300
timezone/search/trie.py Normal file
View file

@ -0,0 +1,300 @@
"""Radix Trie with radix 256
A Radix Trie[1] - once built - allows efficient prefix search. The trie works
on byte strings and hence is oblivious to encoding. The encoding for creation
and search must match. Payload of each node can be an arbitrary object.
Usage
-----
.. code :: python
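    t = Trie()
    t.insert(b"Hello", "P1")
    t.insert(b"Hi", "P2")
    t.insert(b"Hela", "P3")
    t.find(b"He")  # list of (Terminal, label) tuples for "Hello" and "Hela";
                   # the payload is available as Terminal.content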
[1] https://en.wikipedia.org/wiki/Radix_tree
"""
from __future__ import annotations
from typing import Sequence, MutableSequence, ByteString, Any, Optional
from abc import ABC, abstractmethod
import logging
log = logging.getLogger(__name__)
class Trie:
    """Radix trie mapping byte-string labels to arbitrary content.

    With ``multi_value=True`` a Terminal keeps a list of all contents
    inserted under its label; otherwise inserting an existing label
    replaces the content.
    """

    def __init__(self, multi_value=False):
        self.root = Root([])
        self.multi_value = multi_value

    def insert(self, label: ByteString, content: Any):
        """Insert `content` under `label`.

        Returns:
            The Terminal node that now holds `content`.
        """
        log.info(f"Inserting {label} into Trie")

        start = self.root.child_by_common_prefix(label)
        if not start:
            log.debug(f"Creating new terminal for {label} at root")
            new_node = Terminal(label, content, self.root, [], self.multi_value)
            self.root.put_child(new_node)
            return new_node

        log.debug(f"Found match {start} for {label}. Traversing down")
        # Hand the created terminal back on this path as well, so both
        # branches of insert() return the same kind of value.
        return self._insert(start, label, content)

    def _insert(self, node, label, content):
        """Insert below `node`, which shares at least one byte with `label`.

        `label` is the part of the key not yet consumed by the ancestors of
        `node`. Returns the Terminal node holding `content`.
        """
        log.info(f"Inserting {label} into Trie at {node}")

        # Case 1: node represents exactly this label.
        if node.has_label(label):
            log.debug(f"{node} equals {label}. Wrapping node as Terminal.")
            if isinstance(node, Terminal) and not self.multi_value:
                log.warning(f"{node} is already a Terminal. Content will be overwritten.")
            terminal = Terminal.from_child(node, content, self.multi_value)
            node.replace_with(terminal)
            return terminal

        # Case 2: node's label is a proper prefix of label; descend.
        if node.is_prefix_of(label):
            log.debug(f"{node} is prefix of {label}")
            cutoff = node.cut_from(label)
            next_node = node.child_by_common_prefix(cutoff)
            if not next_node:
                log.debug(f"No matching child found for {cutoff}. Creating new child terminal.")
                terminal = Terminal(cutoff, content, node, [], self.multi_value)
                node.put_child(terminal)
                return terminal
            log.debug(f"Found match {next_node} for {cutoff}. Traversing down.")
            return self._insert(next_node, cutoff, content)

        # Case 3: label is a proper prefix of node's label; the new terminal
        # becomes the parent of node.
        if node.starts_with(label):
            log.debug(f"{label} is part of {node}. Creating new parent from {label}")
            new_node = Terminal(label, content, node.parent, [], self.multi_value)
            node.replace_with(new_node)
            node.strip_prefix(label)
            new_node.put_child(node)
            return new_node

        # Case 4: both only share a prefix; introduce a plain ancestor node
        # for the shared part with node and the new terminal as children.
        log.debug(f"{label} and {node} have a common ancestor")
        common_prefix = node.common_prefix(label)
        log.debug(f"Creating new ancestor for {common_prefix}")
        ancestor = Child(common_prefix, node.parent, [])
        node.replace_with(ancestor)
        terminal = Terminal(cut_off_prefix(common_prefix, label), content,
                            ancestor, [], self.multi_value)
        node.strip_prefix(common_prefix)
        ancestor.put_child(terminal)
        ancestor.put_child(node)
        return terminal

    def find(self, prefix):
        """Return all terminals whose label starts with `prefix`.

        Returns:
            A list of ``(terminal, label)`` tuples, where `label` is the
            complete label of the terminal as bytes.
        """
        node, suffix = self._locate(self.root, prefix)
        if node is None:
            return []
        # When the prefix ends inside the found node's label, the remainder
        # of that label belongs to every result below the node.
        return self._get_terminals(node, bytes(prefix) + suffix)

    def _find(self, node, prefix, collector=""):
        """Return the topmost node whose subtree matches `prefix`, or None.

        `prefix` is expected to start with `node`'s own label. `collector`
        is unused and only kept for signature compatibility.
        """
        return self._locate(node, prefix)[0]

    def _locate(self, node, prefix):
        """Walk down from `node` along `prefix`.

        Returns:
            ``(found, suffix)``. `found` is the topmost node whose subtree
            matches, or None. `suffix` is the part of `found`'s label that
            extends beyond `prefix` (empty when the prefix ends exactly at
            `found`).
        """
        rest = node.cut_from(prefix)
        while rest:
            log.debug(f"Searching for {rest} in {node}")
            child = node.child_by_prefix_match(rest)
            if child is None:
                log.debug(f"Leftover cutoff {rest}. Trying to find node with prefix {rest}")
                partial = node.child_by_common_prefix(rest)
                if partial is None or not partial.starts_with(rest):
                    return None, b""
                log.debug(f"Found child {partial} starting with {rest}")
                return partial, bytes(partial.label[len(rest):])
            log.debug(f"Found node {child} in {node} for {rest}. Traversing down.")
            rest = child.cut_from(rest)
            node = child
        return node, b""

    def _get_terminals(self, node, label_builder):
        """Collect ``(terminal, label)`` for `node` and all its descendants.

        `label_builder` is the label accumulated up to and including `node`.
        """
        if not node:
            return []

        collector = []
        if isinstance(node, Terminal):
            collector.append((node, label_builder))

        for child in node.children:
            child_label = child.extend(label_builder)
            collector.extend(self._get_terminals(child, child_label))

        return collector

    def to_dot(self) -> str:
        """Return the trie as the source text of a dot graph."""
        return "graph {\n\n"+self.root.to_dot()+"\n}"
def has_common_prefix(label: ByteString, other_label: ByteString) -> bool:
    """Tell whether both labels begin with the same element.

    Both labels must be non-empty.
    """
    assert label and other_label
    first, other_first = label[0], other_label[0]
    if first == other_first:
        return True
    return False
def common_prefix(label: ByteString, other_label: ByteString) -> ByteString:
    """Get the common prefix of label and other_label.

    Returns:
        The longest shared prefix as immutable ``bytes`` (empty when the
        labels differ in their first byte). The result is used as a node
        label, so it is returned as ``bytes`` rather than a ``bytearray``:
        it stays hashable and cannot be modified behind the trie's back.
    """
    length = 0
    for a, b in zip(label, other_label):
        if a != b:
            break
        length += 1
    return bytes(label[:length])
def is_prefix_of(prefix: ByteString, label: ByteString) -> bool:
    """Tell whether `label` begins with `prefix`.

    An empty prefix is a prefix of every label.
    """
    if len(label) < len(prefix):
        return False
    return all(a == b for a, b in zip(prefix, label))
def find_first(predicate, iterable):
    """Return the first item of `iterable` accepted by `predicate`.

    Returns None when no item is accepted.
    """
    matches = filter(predicate, iterable)
    return next(matches, None)
def cut_off_prefix(prefix: ByteString, label: ByteString) -> ByteString:
    """Return what remains of `label` after removing the leading `prefix`.

    `prefix` must be a prefix of `label`. The remainder is returned as
    ``bytes``.
    """
    assert is_prefix_of(prefix, label)
    start = len(prefix)
    remainder = label[start:]
    return bytes(remainder)
class Node(ABC):
    """Base class of all trie nodes: owns the list of child nodes."""

    def __init__(self, children: MutableSequence[Child]):
        self.children = children

    def child_by_common_prefix(self, label: ByteString) -> Optional[Child]:
        """ Return Child that has a common prefix with label if one exists. """
        def by_common_prefix(child: Child):
            return has_common_prefix(child.label, label)

        return find_first(by_common_prefix, self.children)

    def child_by_prefix_match(self, label: ByteString) -> Optional[Child]:
        """ Return Child which label is a prefix of the given label if one exists. """
        def by_prefix_match(child: Child):
            return is_prefix_of(child.label, label)

        return find_first(by_prefix_match, self.children)

    def put_child(self, child: Child):
        """ Put child into this node's children. Replacing existing children. """
        if child in self.children:
            log.warning(f"Replacing child {child.label}")
            self.remove_child(child)

        child.parent = self
        self.children.append(child)

    def replace_child(self, child: Child, replacement: Child):
        """ Remove child from this node's children and add replacement. """
        self.remove_child(child)
        self.put_child(replacement)

    def remove_child(self, child: Child):
        """ Remove child from this node's children.

        A child that is not present is reported with a warning and
        otherwise ignored.
        """
        if child not in self.children:
            log.warning(f"Trying to delete {child.label} but it does not exist.")
            # Nothing to remove; list.remove() would raise ValueError here.
            return

        self.children.remove(child)

    @abstractmethod
    def dot_label(self) -> str:
        """ Readable label for this node in a dot graph """
        ...

    @abstractmethod
    def dot_id(self) -> str:
        """ Technical id for this node in a dot graph. Must be unique. """
        ...

    @abstractmethod
    def cut_from(self, label: ByteString) -> ByteString:
        """ Cut off node's label considered as prefix from label. """
        ...

    def to_dot(self) -> str:
        """ Return the dot statements for this node and its whole subtree. """
        parts = [f'{self.dot_id()} [label="{self.dot_label()}"]\n']
        for child in self.children:
            parts.append(f"{self.dot_id()} -- {child.dot_id()}\n")
            parts.append(child.to_dot())
        return "".join(parts)
class Root(Node):
    """The label-less entry node of the trie."""

    def dot_id(self):
        """ The root's id in a dot graph is the fixed text "root". """
        return "root"

    def dot_label(self):
        """ The root is displayed as "root" in a dot graph. """
        return "root"

    def cut_from(self, label: ByteString) -> ByteString:
        """ The root has no label of its own, so `label` is returned as is. """
        return label
class Child(Node):
    """A node below the root: carries a label and knows its parent.

    Two children compare equal when their labels are equal.
    """

    def __init__(self, label: ByteString, parent: Node, children: MutableSequence[Child]):
        super().__init__(children)
        self.label = label
        self.parent = parent

    def __eq__(self, other_child):
        return (isinstance(other_child, Child)
                and self.label == other_child.label)

    def __hash__(self):
        # The label may be a bytearray, which is unhashable. Hashing an
        # immutable copy keeps equal labels on equal hashes, because bytes
        # and bytearray with the same content compare equal.
        return hash(bytes(self.label))

    def __str__(self):
        return self.label.decode('utf-8', 'replace').replace('"', '\\"')

    def dot_label(self):
        """ The label decoded as UTF-8 with double quotes escaped. """
        return self.label.decode('utf-8', 'replace').replace('"', '\\"')

    def dot_id(self):
        """ The object's id(), which is unique among living nodes. """
        return id(self)

    def has_label(self, label):
        """ Whether this node's label equals `label`. """
        return self.label == label

    def is_prefix_of(self, label):
        """ Whether this node's label is a prefix of `label`. """
        return is_prefix_of(self.label, label)

    def replace_with(self, new_child: Child):
        """ Put `new_child` into this node's place below its parent. """
        new_child.parent = self.parent
        self.parent.replace_child(self, new_child)

    def starts_with(self, label: ByteString) -> bool:
        """ Whether this node's label starts with `label`. """
        return is_prefix_of(label, self.label)

    def cut_from(self, label: ByteString) -> ByteString:
        """ Cut node's label from (start of) label """
        return cut_off_prefix(self.label, label)

    def strip_prefix(self, prefix: ByteString):
        """ Cut off prefix from node's label """
        self.label = cut_off_prefix(prefix, self.label)

    def extend(self, label: ByteString) -> ByteString:
        """ Extend label by node's label """
        return bytes(label) + bytes(self.label)

    def split_label_at(self, index):
        """ Return the label's parts before and from `index` as a tuple. """
        return (self.label[:index], self.label[index:])

    def contains(self, label):
        """ Whether `label` is a prefix of this node's label. """
        return is_prefix_of(label, self.label)

    def common_prefix(self, label):
        """ The common prefix of this node's label and `label`. """
        return common_prefix(self.label, label)
class Terminal(Child):
    """A Child that marks the end of an inserted label and carries content.

    With ``multi_value`` the content is a list of values, otherwise a
    single value.
    """

    def __init__(self, label: ByteString, content: Any, parent: Node, children: MutableSequence[Child], multi_value: bool):
        super().__init__(label, parent, children)
        self.multi_value = multi_value
        self.content = [content] if multi_value else content

    @classmethod
    def from_child(cls, child: Child, content: Any, multi_value: bool):
        """Create a Terminal that takes over `child`'s label and children.

        If `child` already is a multi-value Terminal, its contents are kept
        behind the new `content`; in that case `multi_value` has no effect,
        i.e. from_child cannot change the multi-value state of an existing
        Terminal. The caller is responsible for putting the returned node
        in `child`'s place below the parent.
        """
        if isinstance(child, Terminal) and child.multi_value:
            # A new instance is not strictly needed here, but it keeps the
            # behaviour in line with the non-multi-value case.
            t = cls(child.label, content, child.parent, child.children, child.multi_value)
            t.content.extend(child.content)  # add back original content
        else:
            t = cls(child.label, content, child.parent, child.children, multi_value)

        # The children list is taken over as is; point the children at
        # their new parent instead of the node that is being replaced.
        for grandchild in t.children:
            grandchild.parent = t
        return t

    def to_dot(self) -> str:
        """ Like Node.to_dot, with this node additionally coloured blue. """
        s = super().to_dot()
        s += f"{self.dot_id()} [color=blue]\n"
        return s

29
timezone/timezone.py Normal file
View file

@ -0,0 +1,29 @@
import csv
from search.trie import Trie
def load_geonames(path="data/cities500.txt"):
    """Load a tab-separated GeoNames file into a multi-value Trie.

    For every row the second column (UTF-8 encoded) is used as label and
    the 18th column as content — presumably place name and timezone in the
    GeoNames layout; verify against the dump's readme.

    Args:
        path: File to read. Defaults to the previously hard-coded location,
            which is relative to the current working directory.

    Returns:
        The populated Trie.

    Raises:
        Exception: Whatever reading a row or inserting it raises; the row
            is reported on stdout before the exception is re-raised.
    """
    t = Trie(multi_value=True)

    # Read as UTF-8 explicitly instead of the platform default, and pass
    # newline='' as the csv module asks for.
    with open(path, "r", encoding="utf-8", newline="") as f:
        # The file is plain tab-separated text without quoting; QUOTE_NONE
        # keeps a '"' inside a name from being treated as a quote.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for i, row in enumerate(reader):
            try:
                t.insert(row[1].encode("utf-8"), row[17])
            except Exception:
                # The row may be too short to have a label at all; do not
                # let the report itself fail and hide the original error.
                label = row[1] if len(row) > 1 else None
                print(f"Error in row {i}")
                print(f"Label: '{label}'")
                print(f"Type: {type(label)}")
                raise

    return t
def check_geonames(path="data/cities500.txt", suffix="lea"):
    """Print the rows of a GeoNames file whose label ends with `suffix`.

    Debugging helper. For each matching row, the row index, the second
    column and the 18th column are printed to stdout.

    Args:
        path: File to read. Defaults to the previously hard-coded location,
            which is relative to the current working directory.
        suffix: Ending the second column has to have. Defaults to the
            previously hard-coded "lea".

    Raises:
        Exception: Whatever processing a row raises; the row is reported on
            stdout before the exception is re-raised.
    """
    # Read as UTF-8 explicitly instead of the platform default, and pass
    # newline='' as the csv module asks for.
    with open(path, "r", encoding="utf-8", newline="") as f:
        # The file is plain tab-separated text without quoting; QUOTE_NONE
        # keeps a '"' inside a name from being treated as a quote.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for i, row in enumerate(reader):
            try:
                if row[1].endswith(suffix):
                    print(f"{i}: {row[1]} \t\t\t {row[17]}")
            except Exception:
                # The row may be too short to have a label at all; do not
                # let the report itself fail and hide the original error.
                label = row[1] if len(row) > 1 else None
                print(f"Error in row {i}")
                print(f"Label: '{label}'")
                print(f"Type: {type(label)}")
                raise