#############################################################################
#
# Represents a single sandboxed Surrogate.
#
# Copyright (C) 2015-17 Menlo Security, Inc.
# All rights reserved.
#
#############################################################################

import functools
import json
import os
import resource
import shutil
import socket
import time
import urlparse

import tornado.concurrent
import tornado.curl_httpclient
import tornado.gen
import tornado.httpclient
import tornado.ioloop
import tornado.iostream
from Crypto import Random as rand

import net
import safly.config
import safly.misc
from safly.config import config, local_ip_v4
from safly.container import os_helpers
from safly.container.container import XvfbContainer, ContainerMounts, BindMount
from safly.container.priv import CAP_SYS_CHROOT, CAP_SYS_ADMIN
from safly.event_reporter import EventReporter
from safly.timex import monotonic
from pnr_observer import PnrObserver

# BrowserStateServer is assumed to be running in the same node and network
# namespace as us -- there's one for each SurrogateManager.
STATE_SERVER_URL = 'http://localhost:%d' % (config.getint('cookie-server',
                                                          'port'))
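# Note: three BrowserStateServer endpoints are used below, each a POST whose
# JSON body includes the browser id 'bid' -- /pull (fetch browser state at
# coupling time), /push (incremental browser-state updates) and /save
# (persist state when the Surrogate requests a sync).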

 

if not hasattr(resource, 'RLIMIT_NICE'):
    resource.RLIMIT_NICE = 13  # Linux specific.

class ChromiumMounts(ContainerMounts):
    """A minimalistic set of mounts, just enough to support running the
    Chromium backend."""
    def __init__(self, *args, **kwargs):
        super(ChromiumMounts, self).__init__(*args, **kwargs)
        sv_cr_path = os.path.dirname(config.get('service', 'sv_cr_path'))
        # This config item already holds a directory, so there's no need to
        # take its dirname before mounting it (that dirname could coincide
        # with the dirname of 'service.sv_cr_path'; see the note below).
        sv_cr_monitor_path = config.get('service', 'sv_cr_monitor_path')
        # Setuid execution should be enabled only for the directory in which
        # the chrome_sandbox resides. Nothing else can be trusted with that
        # bit due to our relaxed capability bounding set.
        #
        # Note: in the default configuration, 'service.sv_cr_monitor_path' is
        # a subdirectory of sv_cr_path (i.e., of
        # os.path.dirname(config.get('service', 'sv_cr_path'))), so in effect
        # the "monitor" path would in that case inherit the extra permissions
        # granted by the BindMount instantiation below. The explicit
        # BindMount instantiation gives the right access to the "monitor"
        # regardless of its location relative to 'service.sv_cr_path'.
        for mount in self._mounts:
            if 'nosuid' not in mount.mount_flags:
                raise Exception('Insecure mount option found: setuid enabled')
        # Must allow suid execution for chrome_sandbox.
        self._mounts.extend([BindMount(sv_cr_path, sv_cr_path, allow_exec=True,
                                       allow_suid=True),
                             BindMount(sv_cr_monitor_path, sv_cr_monitor_path),
                             # Fontconfig files: without this Surrogate will
                             # issue 'Fontconfig error: Cannot load default
                             # config file'.
                             BindMount('/etc/fonts', '/etc/fonts'),
                             # Optimization: so that font cache needn't be
                             # regenerated on each Surrogate.
                             BindMount('/var/cache/fontconfig',
                                       '/var/cache/fontconfig')])

def hosts_to_resolv_conf(hosts):
    """Converts a list of hosts to resolv.conf format."""

    return '\n'.join(['nameserver %s' % h for h in hosts])

class SurrogateContainer(XvfbContainer):
    SECRET_RID_LEN = 16
    # Surrogate states: transitions must occur from lower states to higher
    # ones.
    STATE_STARTED, STATE_READY, STATE_AWAITING_COUPLING, STATE_COUPLED,\
        STATE_DEAD = range(5)
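    # The resulting lifecycle, in increasing numeric order:
    # STARTED -> READY -> AWAITING_COUPLING -> COUPLED -> DEAD.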

    # Surrogate must transition to a higher state before the timeout
    # for the current state expires. Zero indicates infinite timeout (i.e.,
    # it may linger there forever).
    TRANSITION_TIMEOUTS = {STATE_STARTED: config.getint('surrogate_manager',
                                                        'tt_started_secs'),
                           STATE_READY: 0,
                           STATE_AWAITING_COUPLING:
                               config.getint('surrogate_manager',
                                             'tt_awaiting_coupling_secs'),
                           STATE_COUPLED: 0,
                           STATE_DEAD: 0}
    SURROGATE_HTTPD_PORT = config.getint('surrogate', 'http_server_port')
    CONTROL_SOCKET_NAME = 'sm.sock'
    # Frequency at which to check for dead Surrogates and reap them.
    REAP_INTERVAL_MS = 1000
    # We log stats if the following metrics change by the following
    # thresholds. The key names must match the keys in surrogate.get_stats().
    #
    # A 'total_' prefix indicates that the stat includes the current cgroup
    # and all descendant cgroups.
    THRESHOLD_BY_STAT = config.getjson('surrogate_manager',
                                       'stats_logging_thresholds')
    # Minimum rate at which we print stats. Stats may be printed at higher
    # rates due to significant resource changes (as defined by the
    # thresholds above), but this rate is guaranteed.
    STATS_LOGGING_INTERVAL_MS = config.getint('surrogate_manager',
                                              'stats_logging_interval_ms')
    # To support dynamic DNS configuration (e.g., on a per-gateway-device
    # basis), we have all Surrogates use a fixed set of private IPs for DNS
    # requests, and then we use IPTables packet-mangling (DNAT) to route the
    # packets to the real DNS servers. The alternative approach of making the
    # Surrogate aware of DNS changes proved susceptible to races---the
    # Chromium public API offered no clean way to ensure that DNS changes
    # took effect.
    SURROGATE_DNS_HOSTS = ['192.168.3.1', '192.168.3.2']
    # Proxy that Surrogate should use. This is a virtual IP that is DNAT'd on
    # the host to the actual proxy IP address (which could be dynamic).
    SURROGATE_PROXY_IP = '192.168.4.1'

    # Directory of software routers. Exposed to facilitate mocking.
    ROUTER_KLASS_BY_NAME = {'gateway': net.GatewayRouter,
                            'firewall': net.HostInputFirewall}

    # HTTP requests made to external services should abide by these
    # timeouts. The format here is (connect_timeout, total_request_timeout).
    #
    # Justification:
    #
    # State sync may take a lot of time if there is a lot of state.
    REQ_TIMEOUT_BY_KIND = {'state-sync': (10, 600)}

    def __init__(self, ns_pool_man, state_change_cb=None, io_loop=None,
                 surrogate_log_fd=None):
        self._io_loop = io_loop or tornado.ioloop.IOLoop.instance()
        self._state_change_cb = state_change_cb
        # Use the fully qualified name for node_id so nosetest can mock it.
        self.s_nid = safly.config.node_id
        self.secret_rid = rand.get_random_bytes(
            self.SECRET_RID_LEN).encode('hex')
        self.safe_rid = safly.misc.sanitize(self.secret_rid)
        self.http_host = local_ip_v4
        self.state = None
        self._ctrl_sock = None
        self._ctrl_stream = None
        self._transition_timeout = None
        # These periodic timers get cleaned up automatically on container
        # release.
        self._periodic_clocks = {}
        self._routers = {}
        self.safe_bid = None
        self._version = None
        self._cr_version = None
        self._flash_version = None
        self._allow_container_debugging = config.getboolean(
            'surrogate', 'enable_remote_commands') and config.getboolean(
            'surrogate', 'debug')
        # Amount of time that clients have to establish a connection to a
        # coupled Surrogate before the Surrogate is considered abandoned and
        # killed.
        self._connect_timeout_secs = config.getint('surrogate_manager',
                                                   'connect_timeout_secs')
        self._next_ctrl_command_id = 1
        self._pending_ctrl_commands = {}
        self._prior_stats = None
        super(SurrogateContainer, self).__init__(
            # FIXME: SECURITY: we shouldn't expose secret_rid to Container
            # (the xauth_secret gets written to the .Xauthority file
            # accessible from within the Container). Consider using a hash
            # of secret_rid instead (one that's not safe_rid).
            self.secret_rid,
            # TODO: make this a singleton instance.
            ns_pool_man,
            os.path.join(config.get('surrogate_manager', 'chroot_home'),
                         self.secret_rid),
            hostname=self.safe_rid,
            # SECURITY NOTE: While tasks in the container won't have any
            # capabilities by default, this parameter bounds what caps a
            # contained task may acquire via execve, such as those the kernel
            # automatically gives to setuid programs.
            #
            # The most secure configuration is to let this set be empty,
            # which would prevent execve'd programs (including setuid
            # programs) from grabbing more caps than they need (thus the name
            # 'bounding set'). However, the chrome_sandbox is a setuid
            # program that requires some powerful caps to create sandboxed
            # processes. So the bounding set in our use-case cannot be fully
            # empty.
            #
            # To be secure, we must ensure that chrome_sandbox is the only
            # setuid program run inside the container (which we do via the
            # 'nosuid' mount configuration).
            bounding_caps=[CAP_SYS_ADMIN, CAP_SYS_CHROOT],
            mounts=ChromiumMounts(enable_shell=self._allow_container_debugging),
            limits={
                resource.RLIMIT_NPROC: config.getint('surrogate',
                                                     'rlimit_nproc'),
                resource.RLIMIT_CORE: (resource.RLIM_INFINITY if
                                       config.getboolean('service',
                                                         'enable_core_dumps')
                                       else 0),
                # Surrogate will dynamically adjust its scheduling priority /
                # nice levels. By default, non-root users are not allowed to
                # increase priority once lowered (even for the process that
                # lowered it). But this restriction can be bypassed by setting
                # the rlimit. Here we allow it to increase its priority back
                # to its starting point (nice level 0).
                #
                # Note that, regardless of how Surrogate manages its nice
                # levels, we expect cgroups fair sharing to protect other
                # Surrogates.
                resource.RLIMIT_NICE: 20
            },
            resolv_conf=hosts_to_resolv_conf(self.SURROGATE_DNS_HOSTS))
        self._creation_time = time.time()
        self._coupled_time = 0
        # This holds a copy of the Surrogate's config. It's set when the
        # Surrogate is coupled.
        self._config = {}
        # Monotonic time if there is a pending connect deadline, or None if
        # there is no pending deadline.
        self._gc_deadline = None
        self._prior_active_conns = []
        # Initialise surrogate-related members.
        self._surrogate_log_fd = surrogate_log_fd
        assert(isinstance(self._surrogate_log_fd, int))

    def get_extra_logger_fields(self):
        """Parent override."""
        extra = super(SurrogateContainer, self).get_extra_logger_fields()
        extra.append(('surrogate_id', self.safe_rid, 14))
        if self.safe_bid:
            extra.append(('browser_id', self.safe_bid, 14))
        return extra

    def get_logger_name(self):
        """Parent override."""
        return 'sm-surrogate'

    def _report_event(self, event_type, data=None):
        if data is None:
            data = {}

        er = EventReporter.instance()
        if (not er.is_resource_collection_enabled or
                event_type not in er.events_to_watch):
            return
        data['browser_id'] = self.safe_bid
        data['session_id'] = self.safe_rid
        data['client_ip'] = self._config.get('client_ip', 'unknown')
        er.report(event_type, data)

    def _set_state(self, new_state):
        assert(isinstance(new_state, int))
        self.log.info('old_state=%s new_state=%s', self.state, new_state,
                      event='state-change')
        # Security: do not permit transitions to earlier states.
        if self.state is not None and new_state <= self.state:
            self.kill('moving-to-same-or-lower-state-prohibited')
            return
        self.state = new_state
        if self._transition_timeout:
            self._io_loop.remove_timeout(self._transition_timeout)
            self._transition_timeout = None
        state_timeout_secs = self.TRANSITION_TIMEOUTS[self.state]
        if state_timeout_secs:
            self._transition_timeout = self._io_loop.call_later(
                state_timeout_secs, self._on_state_transition_timeout)
        if self._state_change_cb:
            try:
                self._state_change_cb(self)
            except Exception:
                self.log.exception('state change callback threw exception')

    def _on_state_transition_timeout(self):
        self.log.warning('state=%s', self.state,
                         event='transition-timeout')
        self.kill('transition-timeout')

    def set_coupling_deadline(self):
        """Self-destruct if not coupled before the awaiting-coupling
        transition timeout (see TRANSITION_TIMEOUTS) expires."""
        assert(not self._transition_timeout)
        assert(self.state == self.STATE_READY)
        self._set_state(self.STATE_AWAITING_COUPLING)

    def _create_ctrl_sock(self):
        """Creates a filesystem-visible UNIX socket in a place that both the
        Surrogate and its parent (the current process) can see."""
        # The Surrogate's home directory seems like a good choice.
        unix_bind_name = os.path.join(self._home_dir, self.CONTROL_SOCKET_NAME)
        self.log.debug('Listening for connections on %s...',
                       unix_bind_name)
        safly.misc.safe_makedirs(os.path.dirname(unix_bind_name))
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.bind(unix_bind_name)
        sock.listen(1)
        # Give Surrogate exclusive ownership so that only it can connect to
        # this socket.
        os.chown(unix_bind_name, self._uid, self._gid)
        os.chmod(unix_bind_name, 0600)
        return sock

    def _start_extra_debugging_tools(self):
        self.log.info('VNC server inside sandbox to be started at %s:5900; '
                      'use an ssh tunnel to access this address and port '
                      'remotely.', self._netns_handle.addr)
        pid = os.fork()
        if pid:
            return
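        # In each fork below, the parent execs one of the tools while the
        # child forks again, leaving x11vnc, xterm and openbox running as
        # three separate processes inside the container.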

 

        pid = os.fork()
        # Start x11vnc.
        if pid:
            os_helpers.secure_exec('/usr/bin/x11vnc',
                                   ['-forever', '-shared', '-display',
                                    self._get_fully_qualified_display_name()],
                                   env=os.environ)
            # unreachable
            assert(False)

        pid = os.fork()
        # Start xterm.
        if pid:
            os_helpers.secure_exec('/usr/bin/xterm', [], env=os.environ)
            # unreachable
            assert(False)

        # Start the simplest window manager.
        os_helpers.secure_exec('/usr/bin/openbox', [], env=os.environ)
        # unreachable
        assert(False)

    def start(self, *args, **kwargs):
        """Parent override: start the container."""
        # Set up the control sock before we start the Container to avoid
        # races: Surrogate could race ahead and try to connect to the
        # control socket before we've had a chance to start listening.
        self._ctrl_sock = self._create_ctrl_sock()
        self._io_loop.add_handler(self._ctrl_sock.fileno(),
                                  self._on_ctrl_sock_connect_req,
                                  self._io_loop.READ)
        # Fork the initial container process.
        super(SurrogateContainer, self).start(*args, **kwargs)
        # We don't get notification when the container dies, so we need to
        # poll for its death.
        self._periodic_clocks['reap'] = tornado.ioloop.PeriodicCallback(
            self._check_for_death_and_reap, self.REAP_INTERVAL_MS)
        # Two clocks for stats: one for logging only on resource changes and
        # the other for a periodic stats dump.
        self._periodic_clocks['stats'] = tornado.ioloop.PeriodicCallback(
            self._on_stats_timer, 1000)
        self._periodic_clocks['stats_logger'] = tornado.ioloop.PeriodicCallback(
            functools.partial(self._on_stats_timer, True),
            self.STATS_LOGGING_INTERVAL_MS)
        for pc in self._periodic_clocks.values():
            pc.start()
        self._set_state(self.STATE_STARTED)

    def _on_connect_check_timer(self):
        def get_all_conns():
            """Gets all TCP connections into the container.

            This is resilient to the container not existing due to various
            termination races."""
            try:
                return os_helpers.get_tcp4_connections(self.pid)
            except Exception:
                return []

        active_conns = [(local_addr, remote_addr, status)
                        for local_addr, remote_addr, status in
                        get_all_conns() if
                        # Filter out connections originating from the
                        # container itself.
                        remote_addr[0] != self.get_ip_addr() and
                        # Filter out non-SurrogateServer connections.
                        local_addr[1] == self.SURROGATE_HTTPD_PORT and
                        # Filter out sockets in intermediate states like
                        # CLOSE_WAIT (e.g., could result if SurrogateRouter
                        # drops its end of the connection).
                        status == os_helpers.CONN_STATUS_ESTABLISHED]
        if active_conns != self._prior_active_conns:
            self.log.info({'active_conns': active_conns,
                           'count': len(active_conns)},
                          event='active-connections-changed')
            self._prior_active_conns = active_conns
        if active_conns:
            if self._gc_deadline:
                self.log.info({'active_conns': active_conns},
                              event='gc-deadline-cancelled')
            self._gc_deadline = None  # Active connections ==> not abandoned.
        elif not self._gc_deadline:
            self._gc_deadline = monotonic() + self._connect_timeout_secs
            self.log.info({'deadline': self._gc_deadline,
                           'now': monotonic()}, event='gc-deadline-set')
        if self._gc_deadline and (monotonic() >= self._gc_deadline):
            self.log.error({'connect_timeout_secs': self._connect_timeout_secs,
                            'gc_deadline': self._gc_deadline,
                            'monotonic': monotonic()},
                           event='abandoned')
            self._gc_deadline = None
            self.kill('abandoned')

    def get_ip_addr(self):
        return self._netns_handle.addr

    def setup_env(self, *args, **kwargs):
        """Parent override: set up any custom environment vars if needed."""
        super(SurrogateContainer, self).setup_env(*args, **kwargs)
        # The LD_LIBRARY_PATH inherited from the host screws up fonts in
        # SVCR, so make sure it didn't get propagated.
        assert('LD_LIBRARY_PATH' not in os.environ)

    def pre_chroot_setup(self):
        """Parent override: set up additional filesystem state for the
        surrogate."""
        cas_path = config.get('general', 'user_installed_cas_path')
        db_path = os.path.join(cas_path, 'nssdb')
        if os.path.exists(db_path):
            dest_path = os.path.join(self._home_dir, '.pki')
            os.makedirs(dest_path)
            dest_db_path = os.path.join(dest_path, 'nssdb')
            shutil.copytree(db_path, dest_db_path)
            for root, _, files in os.walk(dest_path):
                os.chown(root, self._uid, self._gid)
                for f in files:
                    os.chown(os.path.join(root, f), self._uid, self._gid)

    def on_start(self):
        """Parent override: this gets executed inside the container by
        the first container process. Take this opportunity to launch the
        Surrogate."""
        super(SurrogateContainer, self).on_start()
        disable_features = ['CredentialManagementAPI', 'PointerEvent']
        if not config.getboolean('surrogate', 'ssl_validate_cert_san'):
            disable_features.append('SSLCommonNameMismatchHandling')
        sv_cr_args = [
            '--monitor-js-path=%s' % config.get('service',
                                                'sv_cr_monitor_path'),
            # TODO: add a container API to get the IP instead of reaching
            # into private state.
            '--surrogate-server-address=%s' % self._netns_handle.addr,
            '--surrogate-server-port=%s' % self.SURROGATE_HTTPD_PORT,
            # Not decoding / painting video frames improves performance.
            '--no-video-decode',
            # ALSA string format example: 'hw:1,0,1', so use ';' as the
            # delimiter.
            '--alsa-output-device-list=%s' % ';'.join(self._alsa_device_list),
            # Disable advertising the credential manager until implemented.
            '--disable-features=%s' % ','.join(disable_features),
            # Our VMs don't have GPU support.
            '--disable-gpu',
            '--disable-smooth-scrolling',
            '--display=%s' % self._get_fully_qualified_display_name(),
            '--safeview-safe-rid=%s' % self.safe_rid,
            '--safeview-control-socket-file=%s' % os.path.join(
                self._home_dir_mount, self.CONTROL_SOCKET_NAME)]
        if not config.getboolean('surrogate', 'enable_setuid_sandbox'):
            sv_cr_args.append('--disable-setuid-sandbox')
        if config.getboolean('surrogate', 'debug'):
            sv_cr_args.append('--v=1')
        if config.getboolean('surrogate', 'enable_inspector'):
            sv_cr_args.append('--remote-debugging-port=%s' % config.get(
                'surrogate', 'debug_port'))
        if not config.getboolean('policy-enforcement-server', 'enabled'):
            sv_cr_args.append('--disable-policy-enforcement')
        if config.getboolean('service', 'global_flash_on'):
            sv_cr_args.append('--global-flash-on')
        if config.getboolean('reporting', 'enable_event_reporting'):
            sv_cr_args.append('--enable-event-reporting')
        extra_args = config.get('surrogate_manager', 'extra_chromium_args')
        if extra_args:
            sv_cr_args += [arg.strip() for arg in extra_args.split(',')]
        self.log.debug('sv_cr_args = %s', sv_cr_args)
        # FileServer needs read access to the Downloads dir. If we let
        # Chromium create it, it won't give group read permission.
        download_dir = os.path.join(self._home_dir_mount, 'Downloads')
        safly.misc.safe_makedirs(download_dir)
        os.chmod(download_dir, 0750)
        if self._allow_container_debugging:
            self._start_extra_debugging_tools()

        # Redirect stdout/stderr to the surrogate log.
        for fd_redir_now in (1, 2):
            try:
                os.dup2(self._surrogate_log_fd, fd_redir_now)
            except Exception as e:
                self.log.error({'error': e, 'fd_src': self._surrogate_log_fd,
                                'fd_dest': fd_redir_now},
                               event='log-redirect-failed', opslog=True)
                raise

        os_helpers.secure_exec(config.get('service', 'sv_cr_path'),
                               sv_cr_args,
                               # Close everything but stdin/stdout/stderr.
                               fds_to_leave_open={0, 1, 2},
                               env=os.environ)

    def _check_for_death_and_reap(self):
        """Check if the Surrogate is dead and, if so, reap (i.e., wait() on)
        it. This must be done to avoid zombies and to ensure disposal of
        Surrogate file-system state (e.g., downloads)."""
        assert(self.pid > 0)
        # WNOHANG ==> don't block if the pid isn't actually dead.
        _pid, status = self.wait(flags=os.WNOHANG)
        if self.pid == _pid:
            self.log.info({'pid': self.pid,
                           'status': status,
                           'exit_code': os.WEXITSTATUS(status),
                           'term_sig': os.WTERMSIG(status),
                           'core_dumped': os.WCOREDUMP(status)},
                          event='reaped')

    def _on_ctrl_sock_connect_req(self, fileno, ev):
        self.log.info('fileno=%s event=%s', fileno, ev,
                      event='ctrl-sock-connect')
        assert(fileno == self._ctrl_sock.fileno())
        client_sock = self._ctrl_sock.accept()[0]
        if self._ctrl_stream:
            self.kill('control-sock-already-connected')
        self._ctrl_stream = tornado.iostream.IOStream(client_sock,
                                                      io_loop=self._io_loop)
        self._ctrl_stream.set_close_callback(self._on_ctrl_sock_closed)
        self._ctrl_stream.read_until('\n', self._on_ctrl_message)

    def _on_ctrl_sock_closed(self):
        self.log.info('', event='ctrl-sock-closed')
        # The assumption here is that the Surrogate won't close its control
        # socket until it has finished sync'ing any state.
        self.kill()

    def _do_router_setup(self):
        """Set up container routing rules."""
        assert(self._netns_handle)
        assert(self.pid)
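        # Two software routers are configured below: the 'gateway' router
        # (net.GatewayRouter) handles outbound routing, port forwarding and
        # the optional upstream proxy, while the 'firewall'
        # (net.HostInputFirewall) limits which host-side ports the Surrogate
        # may reach.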

        fw_spec = config.get('surrogate_manager', 'forwarded_ports').strip()
        fw_ports = []
        if fw_spec:
            fw_ports = [tuple(tup.split(':')) for tup in fw_spec.split(',')]
        is_upstream_proxy = self._config.get('is_upstream_proxy', False)
        if 'proxy_config' in self._config:
            proxy_port = int((self._config['proxy_config']).split(':')[1])
        else:
            proxy_port = None
        bounce_devs = config.get('surrogate_manager',
                                 'bounce_devices').strip().split(',')
        # No bounce-all ==> use the default gateway/route.
        if not self._config.get('bounce_all', False):
            # Auto-select the default device (i.e., the one associated with
            # the default network route).
            bounce_devs = ['default']
        # Use the virtual proxy IP only if an upstream proxy is in use.
        if is_upstream_proxy:
            upstream_proxy_config = self._config.get('proxy_config')
        else:
            upstream_proxy_config = None
        gwr = self.ROUTER_KLASS_BY_NAME['gateway'](self, bounce_devs, fw_ports,
                                                   upstream_proxy_config,
                                                   io_loop=self._io_loop)
        self._routers['gwr'] = gwr
        gwr.start()

        allowed_ports = [# For upload API access from the surrogate.
                         ('tcp', config.getint('file-server', 'port')),
                         ('tcp', config.getint('policy-enforcement-server',
                                               'port')),
                         # FIXME: SECURITY: not a good idea to give direct
                         # access to the Redis API due to its large attack
                         # surface. Is this needed in Chromium?
                         ('tcp', config.getint('reporting',
                                               'redis_server_port')),
                         # Surrogate needs to access Safedocs directly
                         # through 10.3.0.1:8081.
                         ('tcp', config.getint('docview', 'port'))]
        if proxy_port and not is_upstream_proxy:
            allowed_ports.append(('tcp', proxy_port))
        # FIXME: SECURITY: best never to expose services on the host, even
        # for test deployments. Consider moving the test page web server out
        # of the deployment.
        if config.getboolean('service', 'serve_test_pages'):
            allowed_ports.extend([('tcp', config.getint('director',
                                                        'opaque_http_port')),
                                  ('tcp', config.getint('director',
                                                        'opaque_https_port'))])
        input_spec = config.get('surrogate_manager',
                                'extra_input_ports').strip()
        if input_spec:
            allowed_ports.extend([tuple(tup.split(':')) for tup in
                                  input_spec.split(',')])
        ifr = self.ROUTER_KLASS_BY_NAME['firewall'](self, allowed_ports)
        self._routers['ifr'] = ifr
        ifr.start()

    def _complete_coupling(self, resp):
        """Let Surrogate know that its container has been set up.

        This includes sending Surrogate its browser state (e.g., cookies)."""
        if resp.error:
            self.log.error({'error': resp.error},
                           event='browser-state-fetch-failed')
        else:
            self.log.info({'body_len': len(resp.body) if resp.body else 0},
                          event='browser-state-loaded')
        # Let Surrogate know that coupling is complete. It won't start
        # network activity (e.g., page load) until it gets this.
        ack = {'c': 'a', 'j': resp.body or json.dumps({}),
               'f': self._flash_list, 's': self._site_flags}
        if self._config.get('is_upstream_proxy', False):
            proxy_port = int((self._config['proxy_config']).split(':')[1])
            proxy_config = '%s:%s' % (self.SURROGATE_PROXY_IP, proxy_port)
        else:
            # TODO: eliminate this path once we get rid of SquidLocal.
            proxy_config = self._config.get('proxy_config', None)
        if proxy_config:
            ack['pc'] = proxy_config
        self._send_command(ack)
        self._set_state(self.STATE_COUPLED)

    def _fetch_browser_state(self, bid):
        """Get browser state from the BrowserStateServer, relay to Surrogate.

        Currently, browser state refers to cookies, but could include
        LocalStorage content (and more) in the future."""
        http_client = tornado.curl_httpclient.CurlAsyncHTTPClient()
        url = urlparse.urljoin(STATE_SERVER_URL, '/pull')
        req = tornado.httpclient.HTTPRequest(url, method='POST',
                                             body=json.dumps({'bid': bid}))
        http_client.fetch(req, self._complete_coupling)

    def _pnr_update(self, pnr_info):
        self._send_command({'c': 'p', 'f': pnr_info['flash_list'],
                            's': pnr_info['site_flags']})

    def _on_surrogate_coupling_request(self):
        """Do container setup that needs to be customized on a per-user
        basis.

        Examples include routing setup (due to bounce tunnel support), and
        fetching cookies."""
        def on_pnr_future_received(resp):
            self._flash_list = resp.result()['flash_list'] or []
            self._site_flags = resp.result()['site_flags'] or {}
            self._fetch_browser_state(self._config['bid'])

        self.log.info({'config': dict(self._config.items() +
                                      {'bid': 'redacted'}.items())},
                      event='got-coupling-request')
        # Start checking for active connections.
        self._periodic_clocks['connect_check'] = tornado.ioloop.PeriodicCallback(
            self._on_connect_check_timer,
            config.getint('surrogate_manager',
                          'connect_check_interval_secs') * 1000)
        self._periodic_clocks['connect_check'].start()
        self._do_router_setup()
        tornado.ioloop.IOLoop.instance().add_future(
            PnrObserver.instance().register(
                self._config['tenant_id'], self._pnr_update),
            on_pnr_future_received)

    def _on_state_sync_requested(self):
        """Tell BrowserStateServer to persist this Surrogate's state."""
        def on_state_sync_completed(resp):
            """BrowserStateServer has persisted changes, so let Surrogate
            know."""
            if resp.error:
                self.log.error({'error': resp.error},
                               event='browser-state-sync-failed')
            else:
                self.log.info({'body': resp.body},
                              event='browser-state-sync-complete')
            # Let Surrogate know. It may use this as an indication that it is
            # safe to shut down.
            self._send_command({'c': 'A', 'r': not resp.error})

        http_client = tornado.curl_httpclient.CurlAsyncHTTPClient()
        url = urlparse.urljoin(STATE_SERVER_URL, '/save')
        connect_timeout, request_timeout = self.REQ_TIMEOUT_BY_KIND['state-sync']
        req = tornado.httpclient.HTTPRequest(url, method='POST',
                                             connect_timeout=connect_timeout,
                                             request_timeout=request_timeout,
                                             body=json.dumps(
                                                 {'bid': self._config['bid']}))
        http_client.fetch(req, on_state_sync_completed)

    def _on_browser_state_update_from_surrogate(self, action):
        """Surrogate wants us to save browser state for future sessions."""
        assert(isinstance(action, list))
        def on_resp(resp):
            if resp.error:
                self.log.error({'error': resp.error},
                               event='browser-state-update-failed')

        # Quite a few of these.
        self.log.debug({'action': action}, event='browser-state-update')
        http_client = tornado.curl_httpclient.CurlAsyncHTTPClient()
        url = urlparse.urljoin(STATE_SERVER_URL, '/push')
        params = {'bid': self._config['bid'], 'action': action}
        req = tornado.httpclient.HTTPRequest(url, method='POST',
                                             body=json.dumps(params))
        http_client.fetch(req, on_resp)

    def _report_resource_load(self, msg):
        """Notify the event consumer of a message from the surrogate
        indicating a resource load."""
        for data in msg.get('events', []):
            self._report_event('webview_resource_request', data=data)
            self.log.debug(data, event='webview-resource-request',
                           opslog=False)

    # pylint: disable=too-many-branches,too-many-return-statements
    def _handle_ctrl_message(self, msg):
        """Process status update messages coming from the Surrogate."""

        self.log.debug('Got control message %s', msg)
        try:
            msg = json.loads(msg)
        except ValueError:
            self.log.error('msg=%s details="Bad JSON"',
                           msg, event='bad-ctrl-msg')
            return
        if not isinstance(msg, dict) or 'c' not in msg:
            self.log.error('msg=%s details="Not a dict or missing command"',
                           msg, event='bad-ctrl-msg')
            return
        cmd = msg['c']
        if cmd == 'R':  # Ready for new connections.
            self._set_state(self.STATE_READY)
            self._version = msg.get('s', 'unknown')
            self._cr_version = msg.get('v', 'unknown')
            self._flash_version = msg.get('f', 'unknown')
        elif cmd == 'C':  # Coupling initiation (includes config).
            if 'v' not in msg or 'tenant_id' not in msg['v']:
                self.log.error('msg=%s details="Missing config"',
                               msg, event='bad-ctrl-msg')
                return
            self._config = json.loads(msg['v'])
            self.safe_bid = self._config['safe_bid']
            self._coupled_time = time.time()
            try:
                self._on_surrogate_coupling_request()
            except Exception:
                self.log.exception({}, event='coupling-setup-failed')
                self.kill('coupling-setup-threw')
        elif cmd == 'J':
            if 'a' not in msg:
                self.log.error('msg=%s details="Missing cookie server '
                               'command"', msg, event='bad-ctrl-msg')
                return
            self._on_browser_state_update_from_surrogate(msg['a'])
        elif cmd == 'S':
            self._on_state_sync_requested()
        elif cmd == 'r':  # Command response.
            if 'i' not in msg or 'v' not in msg:
                if 'i' in msg:
                    future = self._pending_ctrl_commands.pop(msg['i'], None)
                    if future:
                        future.set_exception(RuntimeError(
                            'Bad response from surrogate'))
                self.log.error('msg=%s details="Missing id or value in '
                               'response"', msg, event='bad-ctrl-msg')
                return
            future = self._pending_ctrl_commands.pop(msg['i'], None)
            if not future:
                self.log.error('msg=%s details="Unknown id in response"',
                               msg, event='bad-ctrl-msg')
                return
            future.set_result(msg['v'])
        elif cmd == 'M':  # Max tabs reached.
            self._report_event('tab_limit_exceeded',
                               data={'limit': self._config['tab_limit']})
            self.log.info({'limit': self._config['tab_limit']},
                          event='tab-limit-reached', opslog=True)
        elif cmd == 'w':  # Resource load notification.
            self._report_resource_load(msg)
        else:
            self.log.error('msg=%s details="Unhandled message"', msg,
                           event='bad-ctrl-msg')
            return
        if self._ctrl_stream:
            self._ctrl_stream.read_until('\n', self._on_ctrl_message)

    def _on_ctrl_message(self, msg):
        """Handle the control message sent by the Surrogate."""
        try:
            self._handle_ctrl_message(msg)
        except Exception:
            self.log.exception({'msg': msg},
                               event='handle-ctrl-message-failed')
            self.kill('handle-ctrl-message-failed')

    def release(self, *args, **kwargs):
        """Parent override: the container is dead; clean up
        Surrogate-specific state, especially that allocated in start()."""
        super(SurrogateContainer, self).release(*args, **kwargs)
        if self._config and 'tenant_id' in self._config:
            PnrObserver.instance().deregister(self._config['tenant_id'],
                                              self._pnr_update)
        if self._ctrl_sock:
            # CAREFUL: be sure to deregister the ctrl socket handler, since
            # Python will not GC the Container object while the Tornado
            # IOLoop holds a ref to it.
            self._io_loop.remove_handler(self._ctrl_sock.fileno())
            # Not strictly necessary since ctrl_sock will be automatically
            # closed in the class destructor, but I'm keeping this in for
            # consistency.
            try:
                self._ctrl_sock.close()
            except Exception:
                self.log.warning('failed to close control server socket',
                                 event='ctrl-sock-close-failed')
            self._ctrl_sock = None
        if self._ctrl_stream:
            try:
                self._ctrl_stream.close()
            except Exception:
                self.log.warning('failed to close control stream',
                                 event='ctrl-stream-close-failed')
            self._ctrl_stream = None
        for k, pc in self._periodic_clocks.items():
            pc.stop()
            self._periodic_clocks.pop(k)
        for router_name in self._routers.keys():
            router = self._routers.pop(router_name, None)
            try:
                router.stop()
            except Exception:
                self.log.error({'instance': router},
                               event='router-close-failed')
        self._set_state(self.STATE_DEAD)

    def get_status_dict(self):
        return {
            'container_stats': self.get_stats(),
            'config': self._config,
            'coupled_time': self._coupled_time,
            'creation_time': self._creation_time,
            'safe_rid': self.safe_rid,
            'state': self.state,
            'safe_bid': self.safe_bid,
            'version': self._version,
            'cr_version': self._cr_version,
            'flash_version': self._flash_version,
            'pid': self.pid,
            'unix_username': self._username,
            'cgroup': self._cg_handle.name,
            'task_pids': list(self._cg_handle.get_pids()),
            'ip': (self._netns_handle.addr if self._netns_handle else
                   'unknown')}

    def _send_command(self, cmd, response_expected=False):
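        # Wire format (as implemented below and in _handle_ctrl_message):
        # one newline-terminated JSON object per command, carrying a unique
        # id 'i' plus the caller's fields (e.g. the command code 'c'); a
        # response, if requested, arrives as an 'r' message keyed by the
        # same id.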

        cmd_id = self._next_ctrl_command_id
        self._next_ctrl_command_id += 1
        cmd_final = {'i': cmd_id}
        cmd_final.update(cmd)
        if not self._ctrl_stream:
            self.log.warning({}, event='ctrl-sock-already-closed')
            if response_expected:
                future = tornado.concurrent.Future()
                future.set_exception(IOError('no channel'))
                return future
            return

        self._ctrl_stream.write('%s\n' % json.dumps(cmd_final))
        if response_expected:
            future = tornado.concurrent.Future()
            self._pending_ctrl_commands[cmd_id] = future
            return future
        return None

    def get_details(self):
        return self._send_command({'c': 'd'}, True)

    @tornado.gen.coroutine
    def detach_tab(self, tab_id):
        res = yield self._send_command({'c': 'D', 't': int(tab_id)}, True)
        self.log.info({'tab_id': tab_id, 'res': res}, event='tab-detach')
        if res == 'NO_TAB':
            raise KeyError(tab_id)
        elif res == 'OK':
            return
        else:
            raise Exception(res)

    def _gc_if_unresponsive(self):
        """GC the Surrogate if it becomes unresponsive; this could happen if
        the browser process gets stuck for whatever reason."""
        # TODO: have Surrogate send us a keepalive on an auxiliary channel;
        # the lack of a keepalive would indicate something is wrong. This
        # was super important for WebKit since it didn't implement this
        # internally; Chromium does, however.
        pass

    def _on_stats_timer(self, force_log=False):
        def get_changed_keys():
            if not self._prior_stats:
                return []
            changed_keys = []
            for k, thresh in self.THRESHOLD_BY_STAT.items():
                # Some stats may not be available on this machine (e.g.,
                # total_swap won't be available if there is no swap
                # configured).
                if not (k in stats and k in self._prior_stats):
                    continue
                if abs(stats[k] - self._prior_stats[k]) > thresh:
                    changed_keys.append(k)
            return changed_keys

        stats = self.get_stats()
        changed_keys = get_changed_keys()
        if force_log or changed_keys:
            stats.update({'changed_keys': changed_keys})
            self.log.info(stats, event='container-stats')
            if 'total_pgmajfault' in changed_keys:
                self.log.warning(stats, event='thrashing')
        self._prior_stats = stats

    def __repr__(self):
        return self.safe_rid