Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

""" 

A collection of utility functions and application wide constants. 

""" 

from __future__ import division 

import base64 

import cgi 

import json 

import os 

import re 

import urlparse 

import uuid 

import zlib 

 

import tornado.gen 

import tornado.locale 

import tornado.template 

from tornado import httpclient 

from ua_parser import user_agent_parser 

 

import pnrconfig 

from ops_logging.logger import get_logger 

from redis import Redis 

 

 

# Config files handed to PnrConfig, in this order: the defaults, the local
# override, then whatever pnrconfig.CONFIG_FILES lists.
# NOTE(review): presumably later files override earlier ones - confirm in
# pnrconfig.
config_files = ['config/default.ini',
                'config/override.ini'] + pnrconfig.CONFIG_FILES
config = pnrconfig.PnrConfig(file_names=config_files)
# Module-wide logger, obtained under the name 'util'.
log = get_logger('util')

 

# NOTE: Do not change the order of the PAGE_REQUEST_ACCEPT_HEADERS entries.
# If more headers are required, append them to the end of the list.
PAGE_REQUEST_ACCEPT_HEADERS = [
    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'text/html, application/xhtml+xml, */*',
    'text/html, application/xhtml+xml, image/jxr, */*',
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
]
# Accept header values associated with IE11. Each entry is ONE string written
# as two adjacent literals (implicit concatenation), so the list has exactly
# two elements.
IE_COMPATIBILITY_ACCEPT_HEADERS = [
    # Windows7 + IE11
    'image/jpeg, application/x-ms-application, image/gif, '
    'application/xaml+xml, image/pjpeg, application/x-ms-xbap',
    # Windows10 + IE11
    'image/gif, image/jpeg, image/pjpeg, application/x-ms-application, '
    'application/xaml+xml, application/x-ms-xbap'
]
# Web-font and favicon content types.
# NOTE(review): presumably responses with these types are skipped by the
# importing code - the consumer is not visible in this module.
IGNORE_CONTENT_TYPE = ['font/woff',
                       'application/x-font-woff',
                       'application/font-woff',
                       'image/x-icon']

 

# Local pages: file names of the page templates (see get_localized_page,
# which recommends passing these constants as page_name).
BLOCKED_ERROR_PAGE = '403_forbidden.html'
BLOCKED_DOWNLOAD_PAGE = '403_forbidden_download.html'
BLOCKED_DOWNLOAD_SIZE_EXCEEDED_PAGE = (
    '403_forbidden_download_size_exceeded.html')
BLOCKED_UNSUPPORTED_BROWSER_PAGE = '403_forbidden_browser.html'
FILE_DOWNLOAD_PAGE = 'file-download-frame.html'
SERVICE_ERROR_PAGE = 'service_error.html'

# Headers sent with JSON request bodies (used by prepare_block_page).
JSON_HEADER = {'Content-type': 'application/json'}
# Captures the version string that follows "trident/" in a user agent,
# matched case-insensitively (e.g. "Trident/7.0" captures "7.0").
TRIDENT_VERSION_PATTERN = re.compile(r'trident\/([\w\.]+)', re.IGNORECASE)

 

# NOTE: despite the name, this value is read from the 'hostname' option.
LOCAL_IP = config.get('networking', 'hostname')

# Host used to build the safeview URLs below: the login host when
# 'local_safeview' is enabled, otherwise the safeview domain hostname, falling
# back to the plain safeview hostname when the former is empty.
if config.getboolean('pnr_squid', 'local_safeview'):
    SAFEVIEW_HOST = config.get('service', 'login_host')
else:
    # Example: master-xxx.menlosecurity.com (no cookies)
    SAFEVIEW_HOST = (config.get('networking', 'safeview_domain_hostname') or
                     config.get('networking', 'safeview_hostname'))

# Path component of the block-page URL built in prepare_block_page.
SAFEVIEW_BLOCK_PATH = config.get('icap_server', 'safeview_block_url')
MAXIMUM_FILE_SIZE = config.getint('icap_server', 'max_file_download_size')
TRANSFER_BUFFER_SIZE = config.getint('icap_server', 'fs_buffer_size')
FILE_SERVER_RETRIES = config.getint('file_server', 'locator_retries')
# Configured in milliseconds; stored here in (float) seconds.
FILE_SERVER_POLL_INTERVAL = (config.getint('file_server', 'poll_interval_ms') /
                             1000.0)
# Passed as validate_cert when fetching from the safeview host.
VERIFY_SAFEVIEW_CERT = config.getboolean('networking',
                                         'verify_safeview_server_cert')
WHITELIST_ENABLED = config.getboolean('pnr_squid',
                                      'icap_whitelist_mod_enabled')
# True only when the configured deployment is exactly 'on_prem'.
IS_ON_PREM = (config.get('system_settings', 'deployment') == 'on_prem')

 

# URLs for external service's APIs
CLUSTER_HOST_URL = (
    'https://%s/safeview-director/cluster_host' % SAFEVIEW_HOST)
PE_REQUEST_URL = config.get('policy_enforcement_server', 'request_url')
PE_RESPONSE_URL = config.get('policy_enforcement_server', 'response_url')

# In the templates below the host is substituted immediately, and every '%%s'
# collapses to a '%s' placeholder that the caller fills in later.

# Url for icap-server to initiate protocol with a file-server; the remaining
# placeholder is the attempt number.
FILE_SERVER_INIT_URL = (
    'https://%s/safeview-fileserv-routing/icap_file_request?attempt=%%s'
    % SAFEVIEW_HOST)
# Incomplete url for icap-server to post file to a specific file-server; the
# remaining placeholder is the cid.
FILE_SERVER_POST_URL = (
    'https://%s/safeview-fileserv/icap_file_transfer?cid=%%s' % SAFEVIEW_HOST)

# Incomplete file-server url for platform internal status requests; two
# placeholders remain (a path segment, then the cid).
FILE_SERVER_INT_STATUS_URL = (
    'https://%s/safeview-fileserv/icap_status/%%s/?cid=%%s' % SAFEVIEW_HOST)
# Incomplete file-server url for platform internal transfers; two
# placeholders remain (a path segment, then the cid).
FILE_SERVER_INT_DL_URL = (
    'https://%s/safeview-fileserv/icap_retrieval/%%s/?cid=%%s' % SAFEVIEW_HOST)

# Incomplete file-server url for external client to get the status iframe from
# the file-server. a is file_id, b is single_use_code.
# Unlike the templates above, nothing is substituted here: all four
# placeholders (host, cid, a, b) are left for the caller to fill.
FILE_SERVER_IFRAME_URL = (
    'https://%s/safeview-fileserv/dl_status?cid=%s&a=%s&b=%s')

# Shared secret between icap and file servers until icap can use internal
# HAProxy routes
FILE_SERVER_API_SECRET = config.get('icap_server', 'icap_fs_secret')

 

# Timeouts for connections to external services.
# Options whose names end in '_ms' are configured in milliseconds and stored
# here as float seconds.
SAFEVIEW_TIMEOUT = (config.getint('safeview', 'timeout_ms') / 1000.0)
SAFEVIEW_RETRY_TIMEOUT = (config.getint('safeview', 'retry_timeout_ms') /
                          1000.0)
SAFEVIEW_RETRY_ATTEMPTS = config.getint('safeview', 'retry_attempts')
PE_TIMEOUT = (config.getint('policy_enforcement_server', 'timeout_ms') /
              1000.0)
# Stored exactly as configured (no millisecond conversion).
# NOTE(review): the unit is not evident from this module - confirm.
NATIVE_DOWNLOAD_TIMEOUT = config.getint('icap_server',
                                        'native_processing_timeout')
FILESERVER_TIMEOUT = (config.getint('file_server', 'timeout_ms') /
                      1000.0)
FILESERVER_TRANSFER_TIMEOUT = (config.getint('file_server',
                                             'transfer_timeout_ms') / 1000.0)

 

def _load_json_file(path):
    """Return the object decoded from the JSON document stored at path.

    The file is opened in a with-block so that its handle is closed as soon
    as parsing finishes, instead of being left for the garbage collector.
    Errors from open() or the JSON decoder propagate to the caller.
    """
    with open(path) as json_file:
        return json.load(json_file)


# JSON data files whose locations come from the icap_server config section.
# They are read once, at import time.
fpath = config.get('icap_server', 'default_supported_browsers_path')
DEFAULT_SUPPORTED_BROWSERS = _load_json_file(fpath)
fpath = config.get('icap_server', 'skip_xhr_uris_path')
SKIP_XHR_URIS = _load_json_file(fpath)

 

# Loader for the page templates, rooted at the "templates" directory that
# sits one level above the directory containing this module.
TEMPLATE_LOADER = tornado.template.Loader(
    os.path.join(os.path.dirname(__file__), os.pardir, "templates"))

# Dump objects/data during exceptions
DUMP_ON_EXCEPTION = config.getboolean('icap_server', 'dump_on_exception')
DUMP_DIR = config.get('icap_server', 'dump_dir')
PNR_ENFORCEMENT_REDIS_LOGGING_CHANNEL = config.get('policy_enforcement_server',
                                                   'redis_logging_channel')
STRICT_RESOURCE_MODE_LOGGING_ENABLED = config.getboolean(
    'icap_server', 'strict_resource_mode_logging_enabled')

# Shared Redis client; stays None until get_redis_connection() creates it.
redis_connection = None

 

 

def get_redis_connection():
    """Return the module-wide Redis client, creating it on first use.

    The client is built from the 'redis' config section and cached in the
    module-level redis_connection variable. If building it fails, the error is
    logged and whatever is cached (None on a first failure) is returned, so a
    later call will try again.
    """
    global redis_connection
    if not redis_connection:
        try:
            # Config lookups stay inside the try so that a bad or missing
            # option is logged rather than raised.
            client_options = {
                'password': config.get('redis', 'password'),
                'socket_timeout': config.getfloat('redis', 'socket_timeout'),
                'socket_connect_timeout': config.getfloat(
                    'redis', 'socket_connect_timeout'),
                'socket_keepalive': True,
                'retry_on_timeout': True,
            }
            redis_connection = Redis(**client_options)
        except Exception as ex:
            log.error({'details': ex}, event='redis-connection-unavailable')
    return redis_connection

 

 

def _get_ie_engine_version(user_agent):
    """Return the major IE (trident) engine version found in the user agent.

    The result is the text before the first '.' of the version that follows
    "trident/" (a string, e.g. '7' for "Trident/7.0"), or None when the user
    agent carries no trident token.
    """
    found = TRIDENT_VERSION_PATTERN.search(user_agent)
    if found is None:
        return None
    full_version = found.group(1)
    return full_version.partition('.')[0]

 

 

def add_browser_details(_dict, user_agent):
    """Add the browser family/version from the user agent to the supplied dict.

    _dict always receives the keys 'browser' and 'browser_version'; they are
    '' and None when the user agent is empty or cannot be parsed. Nothing is
    returned - _dict is updated in place.

    For IE user agents carrying a Trident token, the version is derived from
    the engine version (trident major + 4) instead of the parsed one, so that
    compatibility mode does not under-report the browser version.
    """
    _dict['browser'] = ''
    _dict['browser_version'] = None

    if not user_agent:
        return

    parsed_user_agent = user_agent_parser.ParseUserAgent(user_agent)
    if not parsed_user_agent:
        return

    _dict['browser'] = parsed_user_agent.get('family', '')
    _dict['browser_version'] = parsed_user_agent.get('major', None)

    # Check for IE compatibility mode
    if _dict.get('browser') == 'IE' and 'Trident' in user_agent:
        trident_version = _get_ie_engine_version(user_agent)
        if trident_version:
            try:
                _dict['browser_version'] = str(int(trident_version) + 4)
            except ValueError:
                # The trident token is client-controlled and the pattern
                # accepts any word characters (e.g. "Trident/x.0"). Keep the
                # parser-reported version rather than failing the request.
                log.debug({'user_agent': user_agent,
                           'trident_version': trident_version},
                          event='invalid-trident-version')

 

 

def add_headers(_dict, headers):
    """Unpack the supplied pyicap headers into _dict.

    pyicap collates the headers into header dicts, in the form
    key=[val1, val2], so this function will unpack the first entry of the list
    into _dict, in the form _dict[key.lower()]=val1.

    Entries with an empty key or an empty value list are skipped. When a
    header carries more than one value, only the first is kept and the
    occurrence is logged at debug level.
    """
    for key, value in headers.iteritems():
        if not (key and value):
            continue
        if len(value) > 1:
            # Druid cannot handle multiple values. A new druid schema may be
            # required if we see a lot of this error.
            log.debug({'header': key,
                       'len': len(value),
                       'value': value},
                      event='multiple_value_headers_error')
        _dict[key.lower()] = value[0]

 

 

def add_request_data(_dict, http_request):
    """Copy the method, url and http version of http_request into _dict.

    Also stores the pieces of the parsed url: 'domain' (the hostname),
    'url_path' and 'url_parts' (the full urlsplit result). 'url' always holds
    the uri exactly as it appears on the request.
    """
    raw_uri = http_request.uri

    _dict['request_type'] = http_request.method
    _dict['url'] = raw_uri
    _dict['http_version'] = http_request.http_version

    # urlsplit only recognises a network location after '//', so prefix uris
    # that lack one (e.g. "host:443") before parsing.
    parseable_uri = raw_uri if '//' in raw_uri else '//' + raw_uri
    split_result = urlparse.urlsplit(parseable_uri)

    _dict['domain'] = split_result.hostname
    _dict['url_path'] = split_result.path
    _dict['url_parts'] = split_result

 

 

def get_user_data(http_request, icap_request):
    """Return the user data from the http_request/icap_request object.

    The required information is extracted from various header fields.

    Return is a tuple of (tid, uid), where tid is an int.
    If user data is unknown, responds with (-1, "Unknown"). A tenant id that
    is present but not numeric is reported as -1 instead of raising.
    """
    tid = -1
    uid = 'Unknown'
    try:
        # Prefer x-authenticated-user and x-msip-tenant-id but if not set use
        # x-icap-userdata, and if that's not set either, fall back to the
        # default of tenant '-1' and user 'Unknown'
        user_id = http_request.get_header('x-authenticated-user')
        tenant_id = http_request.get_header('x-msip-tenant-id')
        if not user_id:
            userdata = http_request.get_header('x-icap-userdata')
            if userdata and ':' in userdata:
                tid, uid = userdata.split(':', 1)
        # If tid is still -1, try to get the TID from the Client IP
        # this is from the icap request header
        # NOTE(review): the nesting of this block was reconstructed from a
        # copy of the source without indentation - confirm it belongs at
        # this level rather than inside "if not user_id".
        if int(tid) == -1:
            toks = icap_request.get_header('x-icap-req-cache', '').split(':')
            if len(toks) > 1:
                tid = int(toks[0])
        tid = tenant_id or tid
        uid = user_id or uid
    except Exception as e:
        log.exception({'error': str(e),
                       'error_type': type(e).__name__},
                      event='cant-get-user-tenant')
    try:
        return (int(tid), uid)
    except (TypeError, ValueError):
        # tid comes straight from request headers, so it may not be numeric.
        # Honour the documented fallback instead of letting the conversion
        # error escape to the caller.
        log.error({'tid': tid, 'uid': uid}, event='invalid-tenant-id')
        return (-1, uid)

 

 

def populate_icap_req_cache_if_required(_dict, icap_request, icap_response):
    """Generate an x-icap-req-cache key when the icap request has none.

    Does nothing if the request already carries the header. Otherwise a new
    uuid4 hex key is stored in _dict and set on the icap response, and an
    error is logged when the squid whitelist module is enabled.
    """
    if icap_request.get_header('x-icap-req-cache'):
        return

    generated_key = uuid.uuid4().hex
    _dict['x-icap-req-cache'] = generated_key
    # The icap response headers will be used by squid to tie the
    # request and response
    icap_response.modify_header('x-icap-req-cache', generated_key)

    # If squid whitelist module is enabled (for e.g. cloud), log an error
    # since the whitelist module should have generated the cache key
    whitelist_mod_enabled = config.getboolean('pnr_squid',
                                              'icap_whitelist_mod_enabled')
    if whitelist_mod_enabled:
        log.error(_dict, event='icap_cache_key_not_found')

 

 

def get_file_name(http_request, http_response, default=u'download'):
    """Work out a file name for a download from the request/response pair.

    Candidates are tried in this order:
      1. the 'filename' parameter of the response's content-disposition
         header;
      2. the last path segment of the request url (trailing slashes ignored);
      3. default.

    The chosen name is url-unquoted to unicode (UTF-8 first, then latin-1),
    slashes are replaced with '-', and degenerate results such as '', '.',
    '..' or '-' are replaced with default.
    """
    disposition = http_response.get_header('content-disposition', '')
    raw_name = ''
    if disposition:
        _, disposition_params = cgi.parse_header(disposition)
        raw_name = disposition_params.get('filename', '')

    if not raw_name:
        # Nothing usable in content-disposition: take the final url path
        # segment, or default when the path is empty.
        url_path = urlparse.urlparse(http_request.uri).path
        last_segment = url_path.strip('/').split('/')[-1]
        raw_name = last_segment or default

    # The name may be raw bytes or url quoted, so make sure it ends up as a
    # unicode object.
    try:
        decoded_name = tornado.escape.url_unescape(raw_name)
    except UnicodeError:
        # Not valid UTF-8, retry with latin-1.
        try:
            decoded_name = tornado.escape.url_unescape(raw_name, 'latin-1')
        except UnicodeError as e:
            # Highly unexpected, as latin-1 should always decode. Log it and
            # use the default name.
            decoded_name = default
            log.error({'file_name': raw_name, 'type': type(raw_name),
                       'error': e},
                      event='filename-decode-failure')

    # Final sanitising pass: no slashes, no degenerate names.
    decoded_name = decoded_name.replace('/', '-')
    if decoded_name in ('', '.', '..', '/', '-'):
        return default
    return decoded_name

 

 

def compress_and_encode(content_encoding, preview_data):
    """Compress (if needed) and base64 encode preview_data to send to pnr-e.

    content_encoding is the content-encoding value of the data; None is
    treated like an empty value. Data whose encoding mentions 'gzip' is
    assumed to be compressed already and is only base64 encoded. Anything else
    is compressed with zlib.compress first - note that this produces a zlib
    stream, not a gzip container.

    Returns '' when preview_data is empty, otherwise the base64 encoded data.
    """
    if not preview_data:
        return ''
    # "or ''" guards against a missing header being passed in as None, which
    # would otherwise make the membership test raise TypeError.
    if 'gzip' not in (content_encoding or ''):
        preview_data = zlib.compress(preview_data)
    return base64.b64encode(preview_data)

 

 

@tornado.gen.coroutine
def prepare_block_page(block_info, local_block_page,
                       accept_language='en',
                       mod_type='request'):
    """Prepare and return block page contents (coroutine).

    The localized local_block_page is rendered first and acts as the
    fallback. block_info is then POSTed as JSON to the safeview block url; if
    that answers 200 with an 'html' entry, the entry (encoded as UTF-8)
    replaces the local page. Any failure talking to safeview is logged and
    the local page is returned.

    block_info must contain the keys 'user', 'tid', 'url', 'categories' and
    'result'.
    """
    required_keys = ('user', 'tid', 'url', 'categories', 'result')
    assert all(key in block_info for key in required_keys)

    block_url = urlparse.urlunsplit(
        ('https', SAFEVIEW_HOST, SAFEVIEW_BLOCK_PATH, '', ''))
    html_content = get_localized_page(local_block_page, accept_language)
    log_dict = {'block_url': block_url,
                'mod_type': mod_type,
                'accept_language': accept_language}
    try:
        client = httpclient.AsyncHTTPClient()
        reply = yield client.fetch(block_url,
                                   method='POST',
                                   headers=JSON_HEADER,
                                   body=json.dumps(block_info),
                                   request_timeout=SAFEVIEW_TIMEOUT,
                                   validate_cert=VERIFY_SAFEVIEW_CERT)
        if reply.code == 200:
            payload = json.loads(reply.body)
            # Q: Is this needed since we already have a 200 response code
            if 'html' in payload:
                html_content = payload['html'].encode('utf-8')
    except httpclient.HTTPError as ex:
        # HTTP-level failures are logged without a traceback.
        log_dict['details'] = ex
        log.error(log_dict, event='prepare-block-page')
    except Exception as ex:
        log_dict['details'] = ex
        log.exception(log_dict, event='prepare-block-page')
    raise tornado.gen.Return(html_content)

 

 

def get_browser_locale(accept_language_header, default='en'):
    """Determine the browser locale from the Accept-language header.

    Returns a tornado.locale.Locale object using the languages in the
    accept_language_header, before defaulting to 'default' if a locale
    cannot be found. The default is also used when translations are disabled
    in the config or the header is empty.

    Languages are offered to tornado in descending quality ("q=") order;
    entries with equal quality keep their header order. Whitespace around the
    language code and the quality parameter is ignored, and entries whose
    quality is zero or unparsable are dropped (a quality of 0 means "not
    acceptable").

    This function is based on tornado's web.RequestHandler.get_browser_locale.
    """
    # Check translation setting is enabled and accept_language is not empty
    if (config.getboolean('service', 'enable_translations') and
            accept_language_header):
        locales = []
        for language in accept_language_header.split(","):
            parts = language.strip().split(";")
            code = parts[0].strip()
            # Headers such as "en; q=0.5" put whitespace before the quality
            # parameter, so strip it before looking for the "q=" prefix.
            quality = parts[1].strip() if len(parts) > 1 else ''
            if quality.startswith("q="):
                try:
                    score = float(quality[2:])
                except (ValueError, TypeError):
                    score = 0.0
            else:
                score = 1.0
            if score > 0:
                locales.append((code, score))
        if locales:
            # list.sort is stable, so ties retain their header order.
            locales.sort(key=lambda pair: pair[1], reverse=True)
            codes = [pair[0] for pair in locales]
            return tornado.locale.get(*codes)
    return tornado.locale.get(default)

 

 

def get_localized_page(page_name, accept_language_header, **kwargs):
    """Render the local page_name template in the browser's language.

    page_name is the template file name (preferably one of the page constants
    defined in this module) and accept_language_header selects the locale.
    Extra keyword arguments are passed to the template.

    Returns the generated page, or '' if loading or rendering fails (the
    failure is logged).
    """
    try:
        template = TEMPLATE_LOADER.load(page_name)
        browser_locale = get_browser_locale(accept_language_header)
        # generate() does not supply the _() translation function on its
        # own, so pass it in the way tornado's render_string does.
        return template.generate(_=browser_locale.translate, **kwargs)
    except Exception as e:
        log.exception({'error': str(e),
                       'error_type': type(e).__name__,
                       'page': page_name,
                       'accept-language': accept_language_header},
                      event='error-generating-local-page')
    return ''