#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 Frederik Elwert <frederik.elwert@web.de>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
"""
This module provides a base implementation of a TCF compatible web service.
"""
import sys
import argparse
import logging
import requests
from lxml import etree
from tcflib import tcf
class Worker(object):
    """
    A :class:`Worker` is responsible for running a single transformation.

    This is the base class. Options are passed to a :class:`Worker` instance
    as keyword arguments during initialization; they are merged over the
    class-level defaults in :attr:`__options__`. Input data are passed to the
    :meth:`run` method, which returns the output data.

    For efficiency reasons, a worker can get either a byte string or a
    :class:`tcf.TextCorpus` as input. This allows to pass
    :class:`tcf.TextCorpus` objects around without serializing them. If a byte
    string serialization is required, use :func:`tcf.serialize`.

    Workers can be chained with the ``|`` operator: ``data | worker`` is
    equivalent to ``worker.run(data)``.
    """

    #: Default options of the worker, mapping option names to default values.
    __options__ = {}
    #: Passed as the ``layers`` argument to :class:`tcf.TextCorpus` in
    #: :meth:`setup`.
    layers = None

    def __init__(self, **options):
        """
        Initialize a Worker instance with a set of options.

        :param options: Option values that override the class defaults from
            :attr:`__options__`. The merged result is stored as an
            :class:`argparse.Namespace` in ``self.options``.
        """
        logging.debug('Init worker {}.'.format(type(self).__name__))
        self.options = argparse.Namespace()
        # Class defaults first, so that explicitly passed options win.
        vars(self.options).update(self.__options__)
        if options:
            vars(self.options).update(options)
            logging.debug('Using options: {}'.format(self.options))
        else:
            logging.debug('Using default options: {}'.format(self.options))

    def __ror__(self, input_data):
        """Support ``input_data | worker`` as a shorthand for :meth:`run`."""
        return self.run(input_data)

    def setup(self, input_data):
        """
        Read input_data and parse them into a :class:`tcf.TextCorpus`.

        An existing :class:`tcf.TextCorpus` is used as is; anything else is
        handed to the :class:`tcf.TextCorpus` constructor together with
        :attr:`layers`. The result is stored in ``self.corpus``.
        """
        if isinstance(input_data, tcf.TextCorpus):
            self.corpus = input_data
        else:
            self.corpus = tcf.TextCorpus(input_data, layers=self.layers)

    def run(self, input_data):
        """
        This method is called to perform the actual data transformation.

        Subclasses must override this method. The base implementation does
        nothing and returns ``None``.
        """
        pass
class AddingWorker(Worker):
    """
    An :class:`AddingWorker` adds annotations to the input data.
    """

    def run(self, input_data):
        """
        Parse input data and run annotation.

        Subclasses usually do not override this method, but
        :meth:`add_annotations`.

        :param input_data: The data handed to :meth:`setup`.
        :return: ``self.corpus`` as left behind by :meth:`add_annotations`.
        """
        self.setup(input_data)
        self.add_annotations()
        return self.corpus

    def add_annotations(self):
        """
        Subclasses usually override this method.

        It is called by :meth:`run` after :meth:`setup`; the base
        implementation does nothing.
        """
        pass
class ImportingWorker(Worker):
    """
    An :class:`ImportingWorker` converts input data to TCF.
    """

    def setup(self, input_data):
        """
        Store the raw input data in ``self.input_data``.

        Unlike the base class, the input is not parsed here.
        """
        self.input_data = input_data

    def run(self, input_data):
        """
        Store input data and run the import.

        Subclasses usually do not override this method, but
        :meth:`import_`.

        :param input_data: The data handed to :meth:`setup`.
        :return: Whatever :meth:`import_` returns.
        """
        self.setup(input_data)
        return self.import_()

    def import_(self):
        """
        Subclasses usually override this method.

        The base implementation does nothing and returns ``None``.
        """
        pass
class ExportingWorker(Worker):
    """
    An :class:`ExportingWorker` converts TCF data into other formats.
    """

    def run(self, input_data):
        """
        Parse input data and run the export.

        Subclasses usually do not override this method, but
        :meth:`export`.

        :param input_data: The data handed to :meth:`setup`.
        :return: Whatever :meth:`export` returns.
        """
        self.setup(input_data)
        return self.export()

    def export(self):
        """
        Subclasses usually override this method.

        :raises NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
class RemoteWorker(Worker):
    """
    A :class:`RemoteWorker` defers the actual work to a web service.

    This class can either be instantiated directly, passing the `url`
    parameter to its constructor, or it can be subclassed, setting the `url`
    class variable.
    """

    #: The URL of the remote service.
    url = ''

    def __init__(self, **options):
        """
        :param url: The URL of the web service that should be called.
            All remaining keyword arguments are treated as worker options.
        """
        # `url` configures the worker itself and must not end up in the
        # options, which are sent to the service as query parameters.
        if 'url' in options:
            self.url = options.pop('url')
        super().__init__(**options)

    def run(self, input_data):
        """
        Pass input_data to a remote service.

        The input is serialized with :func:`tcf.serialize` and sent as the
        body of a POST request; the worker options are sent as query
        parameters.

        :return: The raw body of the HTTP response.
        :raises requests.HTTPError: If the service answers with an HTTP
            error status, instead of returning the error page as output.
        """
        payload = tcf.serialize(input_data)
        response = requests.post(self.url, params=vars(self.options),
                                 data=payload)
        # Fail loudly here rather than handing an error body down the chain.
        response.raise_for_status()
        return response.content
class Write(object):
    """
    A dummy worker that writes its input into a file.

    It returns the input unchanged, so it can be used to write out intermediate
    results in a chain, e.g. ``data | Write('out.xml') | next_worker``.
    """

    def __init__(self, filename):
        """
        :param filename: Path of the file the input is written to.
        """
        self.filename = filename

    def __ror__(self, input_data):
        """
        Write `input_data` to :attr:`filename` and return it unchanged.

        Falsy input (e.g. ``None`` or an empty byte string) is not written.
        """
        # NOTE(review): this is a truthiness test, so any input object that
        # evaluates as false is skipped as well -- confirm that this is
        # intended for TextCorpus/lxml inputs.
        if input_data:
            if isinstance(input_data, tcf.TextCorpus):
                input_data.write(self.filename)
            elif hasattr(input_data, 'xpath'):
                # Objects offering `xpath` are written through their own
                # `write` method (presumably lxml trees -- verify).
                input_data.write(self.filename, encoding='utf8',
                                 pretty_print=True,
                                 xml_declaration=True)
            else:
                with open(self.filename, 'wb') as outfile:
                    outfile.write(tcf.serialize(input_data))
        return input_data
def Read(filename):
    """
    A dummy worker that reads input from a file.

    :param filename: Path of the file to read.
    :return: The complete file content as a byte string.
    """
    with open(filename, 'rb') as infile:
        return infile.read()
def _parse_bool(text):
    """Convert a command line string into a bool (``bool('false')`` is True)."""
    lowered = text.strip().lower()
    if lowered in ('1', 'true', 'yes', 'on'):
        return True
    if lowered in ('', '0', 'false', 'no', 'off'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(text))


def _option_type(value):
    """Return an argparse ``type`` callable matching the default `value`."""
    if isinstance(value, bool):
        # `bool` itself would turn every non-empty string into True.
        return _parse_bool
    if value is None:
        # `type(None)` cannot be called with an argument; keep the string.
        return str
    return type(value)


def get_arg_parser(worker_class=None):
    """
    Create an ArgumentParser with default options.

    Besides the generic options (verbosity, web service mode, input and
    output file), one ``--<name>`` option is added for every entry of
    ``worker_class.__options__``. The type of the default value decides how
    the command line value is converted; list defaults accept any number of
    values, converted like the first element of the default list.

    :param worker_class: Optional worker class whose ``__options__`` are
        exposed as command line options.
    :return: The configured :class:`argparse.ArgumentParser`.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-v', '--verbose', action='store_true')
    service_group = arg_parser.add_argument_group('Web service',
                                                  'Run as web service')
    service_group.add_argument('-s', '--service', action='store_true')
    service_group.add_argument('-p', '--port', type=int, default=8080)
    cli_group = arg_parser.add_argument_group('Command line',
                                              'Run as command line program')
    cli_group.add_argument('-i', '--infile', default=sys.stdin.buffer,
                           type=argparse.FileType('rb'))
    cli_group.add_argument('-o', '--outfile', default=sys.stdout.buffer,
                           type=argparse.FileType('wb'))
    if worker_class:
        for key, value in worker_class.__options__.items():
            if isinstance(value, list):
                # An empty default list gives no hint about the element
                # type, so the values are kept as strings.
                vtype = _option_type(value[0]) if value else str
                cli_group.add_argument('--' + key, default=value, type=vtype,
                                       nargs='*')
            else:
                cli_group.add_argument('--' + key, default=value,
                                       type=_option_type(value))
    return arg_parser
def run_as_cli(worker_class):
    """
    Run a worker from the commandline.

    In order for a worker to be called from the command line, the module
    defining the worker should call :func:`run_as_cli` when run standalone.

    Depending on the parsed arguments, the worker is either served as a web
    service or run once, reading from the input file and writing the
    serialized result to the output file.

    :param worker_class: The worker class to run.
    """
    # Parse commandline arguments
    arg_parser = get_arg_parser(worker_class)
    args = arg_parser.parse_args()
    # Only the options declared by the worker are passed on to it; generic
    # arguments such as --verbose or --port are not worker options.
    worker_args = {key: value for key, value in vars(args).items()
                   if key in worker_class.__options__}
    # Set up logging
    if args.verbose:
        level = logging.DEBUG
        logging.captureWarnings(True)
    else:
        level = logging.ERROR
    logging.basicConfig(level=level)
    # Run as service or cli program
    if args.service:
        run_as_service(worker_class, port=args.port)
    else:
        # Run transformation
        input_data = args.infile.read()
        worker = worker_class(**worker_args)
        output = worker.run(input_data)
        output = tcf.serialize(output)
        if output:
            args.outfile.write(output)
            # The output file is never closed explicitly, so make sure the
            # data leave the buffer.
            args.outfile.flush()
def run_as_service(worker_class, port, host='localhost'):
    """
    Run a worker as a web service.

    A POST request to ``/annotate`` creates a fresh worker, using the query
    parameters of the request as worker options, runs it on the request body
    and returns the serialized result.

    :param worker_class: The worker class to instantiate for each request.
    :param port: The port the service listens on.
    :param host: The address the service binds to. Defaults to
        ``'localhost'``.
    """
    # Imported lazily, so that bottle is only required in service mode.
    from bottle import request, route, run

    @route('/annotate', method='POST')
    def annotate():
        logging.debug('Got HTTP request.')
        # NOTE(review): query parameters are passed on without conversion,
        # presumably as strings -- verify against workers whose options are
        # not strings.
        worker = worker_class(**request.query)
        output = worker.run(request.body.read())
        output = tcf.serialize(output)
        return output

    run(host=host, port=port)