Coverage for src/util.py: 83%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

"""

A collection of utility functions and application wide constants.

"""

from __future__ import division

import base64

import cgi

import json

import os

import re

import urlparse

import uuid

import zlib

import tornado.gen

import tornado.locale

import tornado.template

from tornado import httpclient

from ua_parser import user_agent_parser

import pnrconfig

from ops_logging.logger import get_logger

from redis import Redis

# Configuration files, listed in this order: bundled defaults, the local
# override file, then any extra files named by pnrconfig.CONFIG_FILES.
# NOTE(review): presumably later files override earlier ones -- confirm
# against PnrConfig.
config_files = ['config/default.ini',
                'config/override.ini'] + pnrconfig.CONFIG_FILES

# Shared configuration object used for the config-derived constants and
# helpers in this module.
config = pnrconfig.PnrConfig(file_names=config_files)

# Module-level logger, registered under the name 'util'.
log = get_logger('util')

# NOTE: Do not change the order of the PAGE_REQUEST_ACCEPT_HEADERS entries.
# If required, add more headers to the end of the array.
# (Presumably consumers refer to entries by position -- confirm with callers.)
PAGE_REQUEST_ACCEPT_HEADERS = [
    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'text/html, application/xhtml+xml, */*',
    'text/html, application/xhtml+xml, image/jxr, */*',
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
]

# Accept header values associated with IE11 (see the per-entry comments).
# Each entry is ONE header value: the adjacent string literals are
# concatenated by Python, so this list holds exactly two strings.
IE_COMPATIBILITY_ACCEPT_HEADERS = [
    # Windows7 + IE11
    'image/jpeg, application/x-ms-application, image/gif, '
    'application/xaml+xml, image/pjpeg, application/x-ms-xbap',
    # Windows10 + IE11
    'image/gif, image/jpeg, image/pjpeg, application/x-ms-application, '
    'application/xaml+xml, application/x-ms-xbap'
]

# Web-font and favicon content types.
# NOTE(review): presumably responses of these types are skipped by the
# caller -- the consumer is not visible in this file.
IGNORE_CONTENT_TYPE = ['font/woff',
                       'application/x-font-woff',
                       'application/font-woff',
                       'image/x-icon']

# Local pages
# Template file names; these are the values intended for the page_name
# argument of get_localized_page().
BLOCKED_ERROR_PAGE = '403_forbidden.html'
BLOCKED_DOWNLOAD_PAGE = '403_forbidden_download.html'
BLOCKED_DOWNLOAD_SIZE_EXCEEDED_PAGE = (
    '403_forbidden_download_size_exceeded.html')
BLOCKED_UNSUPPORTED_BROWSER_PAGE = '403_forbidden_browser.html'
FILE_DOWNLOAD_PAGE = 'file-download-frame.html'
SERVICE_ERROR_PAGE = 'service_error.html'

# Request headers sent with the JSON POST made by prepare_block_page().
JSON_HEADER = {'Content-type': 'application/json'}

# Case-insensitive match for a "trident/<version>" token; group 1 captures
# the version text (word characters and dots).
TRIDENT_VERSION_PATTERN = re.compile(r'trident\/([\w\.]+)', re.IGNORECASE)

# NOTE(review): named LOCAL_IP but read from the 'hostname' option -- confirm
# whether this holds an IP address or a host name.
LOCAL_IP = config.get('networking', 'hostname')

# Host used when building the SafeView / file-server URLs in this module.
if config.getboolean('pnr_squid', 'local_safeview'):
    SAFEVIEW_HOST = config.get('service', 'login_host')
else:
    # Example: master-xxx.menlosecurity.com (no cookies)
    # The domain hostname wins when it is set (truthy); otherwise fall back
    # to the plain safeview hostname.
    SAFEVIEW_HOST = (config.get('networking', 'safeview_domain_hostname') or
                     config.get('networking', 'safeview_hostname'))

# Path component of the block-page URL assembled in prepare_block_page().
SAFEVIEW_BLOCK_PATH = config.get('icap_server', 'safeview_block_url')

# NOTE(review): units of the two size settings below are not visible here
# (presumably bytes) -- confirm against the config files.
MAXIMUM_FILE_SIZE = config.getint('icap_server', 'max_file_download_size')
TRANSFER_BUFFER_SIZE = config.getint('icap_server', 'fs_buffer_size')

FILE_SERVER_RETRIES = config.getint('file_server', 'locator_retries')

# Configured in milliseconds, exposed here as float seconds.
FILE_SERVER_POLL_INTERVAL = (config.getint('file_server', 'poll_interval_ms') /
                             1000.0)

# Passed as validate_cert to the HTTP client in prepare_block_page().
VERIFY_SAFEVIEW_CERT = config.getboolean('networking',
                                         'verify_safeview_server_cert')

# Same option that populate_icap_req_cache_if_required() reads directly from
# config.
WHITELIST_ENABLED = config.getboolean('pnr_squid',
                                      'icap_whitelist_mod_enabled')

# True only when the 'deployment' setting is exactly 'on_prem'.
IS_ON_PREM = (config.get('system_settings', 'deployment') == 'on_prem')

# URLs for external service's APIs
#
# Several templates below are formatted once here with SAFEVIEW_HOST; every
# '%%s' in them survives that step as a literal '%s' placeholder that the
# caller is expected to fill in later.
CLUSTER_HOST_URL = (
    'https://%s/safeview-director/cluster_host' % SAFEVIEW_HOST)

PE_REQUEST_URL = config.get('policy_enforcement_server', 'request_url')
PE_RESPONSE_URL = config.get('policy_enforcement_server', 'response_url')

# Url for icap-server to initiate protocol with a file-server
# Remaining placeholder: attempt.
FILE_SERVER_INIT_URL = (
    'https://%s/safeview-fileserv-routing/icap_file_request?attempt=%%s'
    % SAFEVIEW_HOST)

# Incomplete url for icap-server to post file to a specific file-server
# Remaining placeholder: cid.
FILE_SERVER_POST_URL = (
    'https://%s/safeview-fileserv/icap_file_transfer?cid=%%s' % SAFEVIEW_HOST)

# Incomplete file-server url for platform internal status requests
# Remaining placeholders: one path segment, then cid.
FILE_SERVER_INT_STATUS_URL = (
    'https://%s/safeview-fileserv/icap_status/%%s/?cid=%%s' % SAFEVIEW_HOST)

# Incomplete file-server url for platform internal transfers
# Remaining placeholders: one path segment, then cid.
FILE_SERVER_INT_DL_URL = (
    'https://%s/safeview-fileserv/icap_retrieval/%%s/?cid=%%s' % SAFEVIEW_HOST)

# Incomplete file-server url for external client to get the status iframe from
# the file-server. a is file_id, b is single_use_code
# Unlike the templates above, the host is NOT substituted here: the caller
# must supply four values (host, cid, a, b).
FILE_SERVER_IFRAME_URL = (
    'https://%s/safeview-fileserv/dl_status?cid=%s&a=%s&b=%s')

# Shared secret between icap and file servers until icap can use internal
# HAProxy routes
FILE_SERVER_API_SECRET = config.get('icap_server', 'icap_fs_secret')

# Timeouts for connections to external services
#
# Options whose name ends in '_ms' are configured in milliseconds and are
# converted to float seconds here.
SAFEVIEW_TIMEOUT = (config.getint('safeview', 'timeout_ms') / 1000.0)
SAFEVIEW_RETRY_TIMEOUT = (config.getint('safeview', 'retry_timeout_ms') /
                          1000.0)
SAFEVIEW_RETRY_ATTEMPTS = config.getint('safeview', 'retry_attempts')

PE_TIMEOUT = (config.getint('policy_enforcement_server', 'timeout_ms') /
              1000.0)

# Used as configured, without conversion.
# NOTE(review): the unit of this value is not visible here -- confirm.
NATIVE_DOWNLOAD_TIMEOUT = config.getint('icap_server',
                                        'native_processing_timeout')

FILESERVER_TIMEOUT = (config.getint('file_server', 'timeout_ms') /
                      1000.0)
FILESERVER_TRANSFER_TIMEOUT = (config.getint('file_server',
                                             'transfer_timeout_ms') / 1000.0)

def _load_json_file(path):
    """Return the parsed JSON content of the file at ``path``.

    The file is read inside a ``with`` block so the handle is closed as soon
    as parsing finishes, rather than being left open until it happens to be
    garbage collected.

    Raises IOError if the file cannot be opened and ValueError if it does not
    contain valid JSON, exactly like the open()/json.loads() pair it replaces.
    """
    with open(path) as json_file:
        return json.load(json_file)


# Supported-browser data, parsed from the JSON file named in the config.
fpath = config.get('icap_server', 'default_supported_browsers_path')
DEFAULT_SUPPORTED_BROWSERS = _load_json_file(fpath)

# Skip-XHR URI data, parsed from the JSON file named in the config.
# NOTE: fpath is deliberately left bound to this second path, as before.
fpath = config.get('icap_server', 'skip_xhr_uris_path')
SKIP_XHR_URIS = _load_json_file(fpath)

# Loader for the page templates. It is rooted at the "templates" directory
# found one level above the directory containing this file; individual
# templates are fetched by name through TEMPLATE_LOADER.load().
TEMPLATE_LOADER = tornado.template.Loader(
    os.path.join(os.path.dirname(__file__), os.pardir, "templates"))

# Dump objects/data during exceptions
# NOTE(review): the code that acts on these two settings is not in this
# file -- presumably DUMP_DIR is where the dumps are written.
DUMP_ON_EXCEPTION = config.getboolean('icap_server', 'dump_on_exception')
DUMP_DIR = config.get('icap_server', 'dump_dir')

# Name of the redis channel configured for policy-enforcement logging.
PNR_ENFORCEMENT_REDIS_LOGGING_CHANNEL = config.get('policy_enforcement_server',
                                                   'redis_logging_channel')

# Feature flag read from the 'icap_server' section.
STRICT_RESOURCE_MODE_LOGGING_ENABLED = config.getboolean(
    'icap_server', 'strict_resource_mode_logging_enabled')

# Module-wide Redis client, created on first use by get_redis_connection().
redis_connection = None


def get_redis_connection():
    """Return the shared Redis client, building it on the first call.

    The client is configured from the 'redis' config section and cached in
    the module-level ``redis_connection``. If building it fails, the error is
    logged and whatever is currently cached (normally None) is returned, so
    a later call will try again.
    """
    global redis_connection

    if not redis_connection:
        try:
            client_options = {
                'password': config.get('redis', 'password'),
                'socket_timeout': config.getfloat('redis', 'socket_timeout'),
                'socket_connect_timeout': config.getfloat(
                    'redis', 'socket_connect_timeout'),
                'socket_keepalive': True,
                'retry_on_timeout': True,
            }
            redis_connection = Redis(**client_options)
        except Exception as ex:
            log.error({'details': ex}, event='redis-connection-unavailable')

    return redis_connection

def _get_ie_engine_version(user_agent):
    """Return the major IE (trident) engine version found in the user agent.

    The result is the text before the first '.' of the version that follows
    the "trident/" token, or None when the user agent has no such token.
    """
    found = TRIDENT_VERSION_PATTERN.search(user_agent)
    if found is None:
        return None
    # Keep only the leading (major) component, e.g. '7' from '7.0'.
    major_version = found.group(1).partition('.')[0]
    return major_version

def add_browser_details(_dict, user_agent):
    """Add the browser family/version from the user agent to the supplied dict.

    Sets two keys on ``_dict`` (mutated in place, nothing is returned):

    * ``browser``: the parsed browser family, or '' when it is unknown.
    * ``browser_version``: the parsed major version, or None when unknown.

    For IE the version is derived from the Trident engine token when one is
    present, because a browser in compatibility mode advertises an older MSIE
    version than the one actually running.
    """
    _dict['browser'] = ''
    _dict['browser_version'] = None
    if not user_agent:
        return

    parsed_user_agent = user_agent_parser.ParseUserAgent(user_agent)
    if not parsed_user_agent:
        return

    _dict['browser'] = parsed_user_agent.get('family', '')
    _dict['browser_version'] = parsed_user_agent.get('major', None)

    # Check for IE compatibility mode
    if _dict.get('browser') == 'IE' and 'Trident' in user_agent:
        trident_version = _get_ie_engine_version(user_agent)
        if trident_version:
            try:
                # Trident N ships with IE N + 4 (e.g. Trident/7.0 is IE 11).
                _dict['browser_version'] = str(int(trident_version) + 4)
            except ValueError:
                # The trident pattern also accepts non-numeric text (e.g.
                # "Trident/foo" from a malformed or spoofed user agent).
                # Keep the version reported by the parser instead of letting
                # the conversion error escape to the caller.
                log.debug({'user_agent': user_agent,
                           'trident_version': trident_version},
                          event='invalid-trident-version')

def add_headers(_dict, headers):
    """Unpack the supplied pyicap headers into _dict.

    pyicap collates the headers into header dicts, in the form key=[val1, val2],
    so this function will unpack the first entry of the list into _dict, in the
    form _dict[key]=val1.

    Keys are lower-cased on the way in. Entries with an empty key or an empty
    value list are skipped. When a header carries several values only the
    first one is kept and the occurrence is logged at debug level.
    """
    # items() rather than iteritems(): works on both Python 2 and 3.
    for key, values in headers.items():
        if not (key and values):
            continue

        if len(values) > 1:
            # Druid cannot handle multiple values. A new druid schema may be
            # required if we see a lot of this error.
            log.debug({'header': key,
                       'len': len(values),
                       'value': values},
                      event='multiple_value_headers_error')

        _dict[key.lower()] = values[0]

def add_request_data(_dict, http_request):
    """Copy the request method, url and http version into _dict.

    The url is also split into its components: ``domain`` (the hostname),
    ``url_path`` and the complete split result under ``url_parts``. ``url``
    itself is stored exactly as received.
    """
    raw_url = http_request.uri
    _dict['request_type'] = http_request.method
    _dict['url'] = raw_url
    _dict['http_version'] = http_request.http_version

    # Without a '//' the splitter would not see a network location, so prefix
    # one on a working copy only.
    splittable_url = raw_url if '//' in raw_url else '//' + raw_url
    split_result = urlparse.urlsplit(splittable_url)

    _dict.update({
        'domain': split_result.hostname,
        'url_path': split_result.path,
        'url_parts': split_result,
    })

def get_user_data(http_request, icap_request):
    """Return the user data from the http_request/icap_request object.

    The required information is extracted from various header fields.
    Return is a tuple of (tid, uid), where tid is an int.
    If user data is unknown, responds with (-1, "Unknown").

    Lookup order:

    1. ``x-authenticated-user`` / ``x-msip-tenant-id`` on the http request.
    2. ``x-icap-userdata`` ("<tid>:<uid>") when no authenticated user is set.
    3. The leading token of the icap request's ``x-icap-req-cache`` header,
       when the tenant id is still -1.

    This function never raises: any failure is logged, and a tenant id that
    cannot be converted to an int is reported as -1.
    """
    tid = -1
    uid = 'Unknown'
    try:
        # Prefer x-authenticated-user and x-msip-tenant-id but if not set use
        # x-icap-userdata, and if that's not set either, fall back to the
        # default of tenant '-1' and user 'Unknown'
        user_id = http_request.get_header('x-authenticated-user')
        tenant_id = http_request.get_header('x-msip-tenant-id')
        if not user_id:
            userdata = http_request.get_header('x-icap-userdata')
            if userdata and ':' in userdata:
                tid, uid = userdata.split(':', 1)

        # If tid is still -1, try to get the TID from the Client IP
        # this is from the icap request header
        if int(tid) == -1:
            toks = icap_request.get_header('x-icap-req-cache', '').split(':')
            if len(toks) > 1:
                tid = int(toks[0])

        # Settle the user first so that a malformed tenant header cannot
        # discard an otherwise valid authenticated user.
        uid = user_id or uid
        # Convert inside the try block so a non-numeric tenant id is logged
        # here instead of escaping from the return statement.
        tid = int(tenant_id or tid)
    except Exception as e:
        log.exception({'error': str(e),
                       'error_type': type(e).__name__},
                      event='cant-get-user-tenant')

    # After a failure tid may still hold raw header text. The failure has
    # already been logged above, so just fall back to the documented default.
    try:
        tid = int(tid)
    except (TypeError, ValueError):
        tid = -1
    return (tid, uid)

def populate_icap_req_cache_if_required(_dict, icap_request, icap_response):
    """Make sure the transaction carries an 'x-icap-req-cache' key.

    Nothing happens when the icap request already has the header. Otherwise
    a fresh random key is stored in _dict and set on the icap response, and
    an error is logged if the squid whitelist module is enabled.
    """
    if not icap_request.get_header('x-icap-req-cache'):
        generated_key = uuid.uuid4().hex
        _dict['x-icap-req-cache'] = generated_key

        # The icap response headers will be used by squid to tie the
        # request and response
        icap_response.modify_header('x-icap-req-cache', generated_key)

        # With the squid whitelist module enabled (e.g. in the cloud) the
        # module should have supplied the key already, so its absence is
        # reported as an error.
        whitelist_active = config.getboolean('pnr_squid',
                                             'icap_whitelist_mod_enabled')
        if whitelist_active:
            log.error(_dict, event='icap_cache_key_not_found')

def get_file_name(http_request, http_response, default=u'download'):
    """Work out a file name for the download described by request/response.

    Sources are tried in this order:

    1. the ``filename`` parameter of the response's content-disposition
       header;
    2. the last path segment of the request url (trailing '/' ignored);
    3. ``default``.

    The chosen name is url-unquoted into a unicode object (UTF-8 first, then
    latin-1), has every '/' replaced by '-', and is swapped for ``default``
    if what remains is empty or a bare '.', '..', '/' or '-'.
    """
    candidate = ''
    disposition = http_response.get_header('content-disposition', '')
    if disposition:
        _, disposition_params = cgi.parse_header(disposition)
        candidate = disposition_params.get('filename', '')

    if not candidate:
        # Nothing usable in content-disposition: use the final path segment
        # of the request url, or the default when the path is empty.
        url_path = urlparse.urlparse(http_request.uri).path
        last_segment = url_path.strip('/').split('/')[-1]
        candidate = last_segment or default

    # The candidate may be raw bytes or url quoted, so decode it to unicode.
    try:
        decoded_name = tornado.escape.url_unescape(candidate)
    except UnicodeError:
        # Not valid UTF-8; retry with latin-1.
        try:
            decoded_name = tornado.escape.url_unescape(candidate, 'latin-1')
        except UnicodeError as e:
            # Highly unexpected, as latin-1 should always decode. Report it
            # and carry on with the default name.
            decoded_name = default
            log.error({'file_name': candidate, 'type': type(candidate),
                       'error': e},
                      event='filename-decode-failure')

    # Final clean-up: no slashes, and no degenerate names.
    decoded_name = decoded_name.replace('/', '-')
    if decoded_name in ('', '.', '..', '/', '-'):
        return default
    return decoded_name

def compress_and_encode(content_encoding, preview_data):
    """Compress (if needed) and base64 encode preview_data to send to pnr-e.

    Args:
        content_encoding: value of the Content-Encoding header, or None / ''
            when the header is absent.
        preview_data: the raw preview bytes.

    Returns:
        '' when there is no preview data, otherwise the base64 encoding of
        the payload. A payload whose content encoding mentions 'gzip' is
        passed through as-is; anything else is compressed first with
        zlib.compress (a zlib stream, not a gzip container).
    """
    if not preview_data:
        return ''

    # A missing Content-Encoding header must not break the membership test.
    if 'gzip' not in (content_encoding or ''):
        preview_data = zlib.compress(preview_data)
    return base64.b64encode(preview_data)

@tornado.gen.coroutine
def prepare_block_page(block_info, local_block_page,
                       accept_language='en',
                       mod_type='request'):
    """Build the block page contents (coroutine).

    The localized local template is rendered first and acts as the fallback.
    block_info is then POSTed as JSON to the safeview block url; when that
    answers 200 with an 'html' member, the member (UTF-8 encoded) replaces
    the local page. Request failures are logged and the local page is kept.

    block_info must contain the keys 'user', 'tid', 'url', 'categories' and
    'result'.
    """
    required_keys = ['user', 'tid', 'url', 'categories', 'result']
    assert all(x in block_info for x in required_keys)

    block_url = urlparse.urlunsplit(
        ('https', SAFEVIEW_HOST, SAFEVIEW_BLOCK_PATH, '', ''))
    page_content = get_localized_page(local_block_page, accept_language)
    log_dict = {
        'block_url': block_url,
        'mod_type': mod_type,
        'accept_language': accept_language
    }

    try:
        response = yield httpclient.AsyncHTTPClient().fetch(
            block_url,
            method='POST',
            headers=JSON_HEADER,
            body=json.dumps(block_info),
            request_timeout=SAFEVIEW_TIMEOUT,
            validate_cert=VERIFY_SAFEVIEW_CERT)
        if response.code == 200:
            payload = json.loads(response.body)
            # Only replace the local page when the answer actually carries
            # an 'html' member.
            if 'html' in payload:
                page_content = payload['html'].encode('utf-8')
    except httpclient.HTTPError as ex:
        log_dict['details'] = ex
        log.error(log_dict, event='prepare-block-page')
    except Exception as ex:
        log_dict['details'] = ex
        log.exception(log_dict, event='prepare-block-page')

    raise tornado.gen.Return(page_content)

def get_browser_locale(accept_language_header, default='en'):
    """Determine the browser locale from the Accept-language header.

    Returns a tornado.locale.Locale object using the languages in the
    accept_language_header, before defaulting to 'default' if a locale
    cannot be found.

    Languages are tried in descending order of their quality value ("q=").
    A language without a quality value counts as 1.0. A language whose
    quality is zero, negative or unparsable is not offered at all, since
    q=0 means "not acceptable".

    This function is based on tornado's web.RequestHandler.get_browser_locale.
    """
    # Check translation setting is enabled and accept_language is not empty
    if (config.getboolean('service', 'enable_translations') and
            accept_language_header):
        locales = []
        for language in accept_language_header.split(","):
            parts = language.strip().split(";")
            code = parts[0].strip()
            score = 1.0
            if len(parts) > 1:
                # Whitespace after the ';' is legal ("fr; q=0.5"), so strip
                # it before looking for the quality parameter.
                quality = parts[1].strip()
                if quality.startswith("q="):
                    try:
                        score = float(quality[2:])
                    except (ValueError, TypeError):
                        score = 0.0
            if score > 0:
                locales.append((code, score))

        if locales:
            # The sort is stable, so equal scores keep their header order.
            locales.sort(key=lambda pair: pair[1], reverse=True)
            codes = [pair[0] for pair in locales]
            return tornado.locale.get(*codes)

    return tornado.locale.get(default)

def get_localized_page(page_name, accept_language_header, **kwargs):
    """Render the local page_name template in the browser's language.

    The language is chosen from accept_language_header. page_name is the
    name of a template file, preferably one of the page constants defined in
    this file. Extra keyword arguments are handed to the template.

    Returns the generated page, or '' when anything goes wrong (the failure
    is logged).
    """
    try:
        page_template = TEMPLATE_LOADER.load(page_name)
        browser_locale = get_browser_locale(accept_language_header)
        # generate() does not supply the _() translation function by itself,
        # so pass it in the same way tornado's render_string does.
        return page_template.generate(_=browser_locale.translate, **kwargs)
    except Exception as e:
        log.exception({'error': str(e),
                       'error_type': type(e).__name__,
                       'page': page_name,
                       'accept-language': accept_language_header},
                      event='error-generating-local-page')
    return ''

Coverage for src/util.py : 83%

217 statements 181 run 36 missing 0 excluded