Coverage for surrogate.py: 62%

##############################################################################
#
# Represents a single sandboxed Surrogate.
#
# Copyright (C) 2015-17 Menlo Security, Inc.
# All rights reserved.
#
##############################################################################
# BrowserStateServer is assumed to be running in the same node and network
# namespace as us -- there's one for each SurrogateManager.
'port'))
"""A minimalistic set of mounts, just enough to support running the Chromium backend.""" # The config item holds a directory, so there's no need to mount its # parent directory (which could coincide with the dirname for # 'service.sv_cr_path' (see below)). # Setuid execution should be enabled only for the directory in which # the chrome_sandbox resides. Nothing else can be trusted with that # bit due to our relaxed capability bounding set. # # Note: in the default configuration, 'service.sv_cr_monitor_path' is a # subdirectory of the variable sv_cr_path # (which is os.path.dirname('service', 'sv_cr_path')), so in effect the # "monitor" path would in that case have the extra permissions granted by # the BindMount instantiation (below). The explicit BindMount # instantiation gives the right access to the "monitor", regardless of # its (relative) location (to 'service.sv_cr_path'). raise Exception('Insecure mount option found: setuid enabled') # Must allow suid execution for chrome_sandbox. allow_suid=True), BindMount(sv_cr_monitor_path, sv_cr_monitor_path), # Fontconfig files: without this Surrogate will # issue 'Fontconfig error: Cannot load default # config file'. BindMount('/etc/fonts', '/etc/fonts'), # Optimization: so that font cache needn't be # regenerated on each Surrogate. BindMount('/var/cache/fontconfig', '/var/cache/fontconfig')])
"""Converts a list of hosts to resolv.conf format."""
# Surrogate states: transitions must occur from lower states to higher
# ones.
STATE_DEAD = range(5)
# Surrogate must transition to a higher state before the timeout
# for the current state expires. Zero indicates infinite timeout (i.e.,
# it may linger there forever).
'tt_started_secs'),
STATE_READY: 0,
STATE_AWAITING_COUPLING: config.getint('surrogate_manager',
                                       'tt_awaiting_coupling_secs'),
STATE_COUPLED: 0,
STATE_DEAD: 0}
'http_server_port')
# Frequency at which to check for dead Surrogates and reap them.
# We log stats if the following metrics change by the following
# thresholds. The key names must match the keys in surrogate.get_stats().
#
# 'total_' prefix indicates that stat includes current group and all
# descendant cgroups.
'stats_logging_thresholds')
# Minimum rate at which we print stats. It may be printed at higher rates
# due to significant resource changes (as defined by thresholds above),
# but this rate is guaranteed.
'stats_logging_interval_ms')
# To support dynamic DNS configuration (e.g., on a per gateway-device
# basis), we have all Surrogates use a fixed set of private IPs for DNS
# requests, and then we use IPTables packet-mangling (DNAT) to route the
# packets to the real DNS servers. The alternative approach of making the
# Surrogate aware of DNS changes proved susceptible to races---the
# Chromium public API offered no clean way to ensure that DNS changes
# took effect.
# Proxy that Surrogate should use. This is a virtual IP that is DNAT'd on
# the host to the actual proxy IP address (which could be dynamic).
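# A minimal sketch of how the state enumeration and the per-state timeout
# table fit together. STATE_READY, STATE_AWAITING_COUPLING, STATE_COUPLED,
# and STATE_DEAD appear above; the first state's name and the literal
# timeout values are assumptions (the real values come from config).
STATE_STARTED, STATE_READY, STATE_AWAITING_COUPLING, \
    STATE_COUPLED, STATE_DEAD = range(5)
STATE_TIMEOUT_SECS = {
    STATE_STARTED: 60,            # e.g. 'tt_started_secs'
    STATE_READY: 0,               # 0 == may linger forever
    STATE_AWAITING_COUPLING: 30,  # e.g. 'tt_awaiting_coupling_secs'
    STATE_COUPLED: 0,
    STATE_DEAD: 0,
}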
# Directory of software routers. Exposed to facilitate mocking.
'firewall': net.HostInputFirewall}
# HTTP requests made to external services should abide by these
# timeouts. The format here is (connect_timeout, total_request_timeout).
#
# Justification:
#
# State sync may take a lot of time if there is a lot of state.
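# A minimal sketch of the timeout table described above. The dict name and
# the 'state-sync' key appear where the table is used later in this file;
# the other kinds and the literal values are assumptions.
REQ_TIMEOUT_BY_KIND = {
    # (connect_timeout_secs, total_request_timeout_secs)
    'state-sync': (5, 120),   # state sync may move a lot of data
    'state-fetch': (5, 30),
}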
surrogate_log_fd=None):
# Use the fully qualified name for node_id so nosetest can mock it.
# These periodic timers get cleaned up automatically on container
# release.
'enable_remote_commands') and config.getboolean('surrogate', 'debug')
# Amount of time that clients have to establish a connection to a
# coupled Surrogate before the Surrogate is considered abandoned and
# killed.
'connect_timeout_secs')
# FIXME: SECURITY: we shouldn't expose secret_rid to Container
# (the xauth_secret gets written to the .Xauthority file
# accessible from within the Container). Consider using a hash
# of secret_rid instead (that's not safe_rid).
self.secret_rid,
# TODO: make this a singleton instance.
ns_pool_man,
os.path.join(config.get('surrogate_manager', 'chroot_home'),
             self.secret_rid),
hostname=self.safe_rid,
# SECURITY NOTE: While tasks in the container won't have any
# capabilities by default, this parameter bounds what caps a
# contained task may acquire via execve, such as those the kernel
# automatically gives to setuid programs.
#
# The most secure configuration is to let this set be empty,
# which would prevent execve'd programs (including setuid
# programs) from grabbing more caps than they need (thus the name
# 'bounding set'). However, the chrome_sandbox is a setuid
# program that requires some powerful caps to create sandboxed
# processes. So the bounding set in our use-case cannot be fully
# empty.
#
# To be secure, we must ensure that chrome_sandbox is the only
# setuid program run inside the container (which we do via the
# 'nosuid' mount configuration).
bounding_caps=[CAP_SYS_ADMIN, CAP_SYS_CHROOT],
mounts=ChromiumMounts(enable_shell=self._allow_container_debugging),
limits={
    resource.RLIMIT_NPROC: config.getint('surrogate', 'rlimit_nproc'),
    resource.RLIMIT_CORE: (resource.RLIM_INFINITY
                           if config.getboolean('service',
                                                'enable_core_dumps')
                           else 0),
    # Surrogate will dynamically adjust its scheduling priority /
    # nice levels. By default, non-root users are not allowed to
    # increase priority once lowered (even for the process that
    # lowered it). But this restriction can be bypassed by setting
    # the rlimit. Here we allow it to increase its priority back
    # to its starting point (nice level 0).
    #
    # Note that, regardless of how Surrogate manages its nice
    # levels, we expect cgroups fair sharing to protect other
    # Surrogates.
    resource.RLIMIT_NICE: 20
},
resolv_conf=hosts_to_resolv_conf(self.SURROGATE_DNS_HOSTS))
# This holds a copy of the Surrogate's config. It's set when the
# Surrogate is coupled.
# Monotonic time if there is a pending connect deadline, or None if
# there is no pending deadline.
# Initialise surrogate-related members.
"""Parent override."""
"""Parent override."""
data = {}
event_type not in er.events_to_watch):
    return
assert(isinstance(new_state, int))
event='state-change')
# Security: do not permit transitions to earlier states.
self.kill('moving-to-same-or-lower-state-prohibited')
return
state_timeout_secs,
self._on_state_transition_timeout)
except Exception:
    self.log.exception('state change callback threw exception')
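# A minimal sketch of the forward-only transition guard the 'Security'
# comment above refers to. The attribute and method names mirror fragments
# in this file, but how the new state is stored is an assumption, and the
# transition-timeout arming done by the real method is omitted.
def _set_state(self, new_state):
    assert isinstance(new_state, int)
    if new_state <= self.state:
        # Security: do not permit transitions to earlier states.
        self.kill('moving-to-same-or-lower-state-prohibited')
        return
    self.state = new_state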
self.log.warning('state=%s', self.state, event='transition-timeout')
self.kill('transition-timeout')
"""Self-destruct if not coupled within @secs seconds.""" assert(not self._transition_timeout) assert(self.state == self.STATE_READY) self._set_state(self.STATE_AWAITING_COUPLING)
"""Creates a filesystem-visible UNIX socket in a place that both the Surrogate and its parent (current process) can see.""" # The Surrogate's home directory seems like a good choice. unix_bind_name) # Give Surrogate exclusive ownership so that only it can connect to # this socket.
self.log.info('VNC server inside sandbox to be started at %s:5900, '
              'use ssh tunnel to access this address and port remotely.',
              self._netns_handle.addr)
pid = os.fork()
if pid:
    return
pid = os.fork()
# Start x11vnc
if pid:
os_helpers.secure_exec('/usr/bin/x11vnc',
                       ['-forever', '-shared', '-display',
                        self._get_fully_qualified_display_name()],
                       env=os.environ)
# unreachable
assert(False)
pid = os.fork()
# Start xterm.
if pid:
os_helpers.secure_exec('/usr/bin/xterm', [], env=os.environ)
# unreachable
assert(False)
# Start the simplest window manager.
os_helpers.secure_exec('/usr/bin/openbox', [], env=os.environ)
# unreachable
assert(False)
"""Parent override: start the container.""" # Set up the control sock before we start the Container to avoid # races: Surrogate could race ahead and try to connect to the # control socket before we've had a chance to start listening. self._on_ctrl_sock_connect_req, self._io_loop.READ) # Fork the initial container process. # We don't get notification when the container dies, so we need to # poll for its death. self._check_for_death_and_reap, self.REAP_INTERVAL_MS) # Two clocks for stats: one for logging only on resource changes and # the other for a periodic stats dump. self._on_stats_timer, 1000) functools.partial(self._on_stats_timer, True), self.STATS_LOGGING_INTERVAL_MS)
"""Gets all TCP connections into the container.
This is resilient to the container not existing due to various
termination races."""
except Exception:
    return {}
for local_addr, remote_addr, status in get_all_conns() if
# Filter out connections originating from the
# container itself.
remote_addr[0] != self.get_ip_addr() and
# Filter out non-SurrogateServer connections.
local_addr[1] == self.SURROGATE_HTTPD_PORT and
# Filter out sockets in intermediate states like
# CLOSE_WAIT (e.g., could result if SurrogateRouter
# drops its end of the connection).
status == os_helpers.CONN_STATUS_ESTABLISHED]
'count': len(active_conns)},
event='active-connections-changed')
self.log.info({'active_conns': active_conns},
              event='gc-deadline-cancelled')
elif not self._gc_deadline:
self._gc_deadline = monotonic() + self._connect_timeout_secs
self.log.info({'deadline': self._gc_deadline, 'now': monotonic()},
              event='gc-deadline-set')
self.log.error({'connect_timeout_secs': self._connect_timeout_secs,
                'gc_deadline': self._gc_deadline,
                'monotonic': monotonic()},
               event='abandoned')
self._gc_deadline = None
self.kill('abandoned')
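# A minimal sketch of the abandoned-Surrogate check implemented by the
# fragments above: cancel the GC deadline while clients are connected, arm
# it when they are not, and kill once it expires. The control flow and the
# _get_active_connections helper are assumptions reconstructed from the
# log events; logging is omitted for brevity.
def _on_connect_check_timer(self):
    active_conns = self._get_active_connections()  # assumed helper
    if active_conns:
        self._gc_deadline = None
    elif not self._gc_deadline:
        self._gc_deadline = monotonic() + self._connect_timeout_secs
    elif monotonic() > self._gc_deadline:
        self._gc_deadline = None
        self.kill('abandoned')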
"""Parent override: set up any custom environment vars if needed.""" super(SurrogateContainer, self).setup_env(*args, **kwargs) # The LD_LIBRARY_PATH inherited from the host screws up fonts in # SVCR, so make sure it didn't get propagated. assert('LD_LIBRARY_PATH' not in os.environ)
"""Parent override: setup additional filesystem state for surrogate.""" cas_path = config.get('general', 'user_installed_cas_path') db_path = os.path.join(cas_path, 'nssdb') if os.path.exists(db_path): dest_path = os.path.join(self._home_dir, '.pki') os.makedirs(dest_path) dest_db_path = os.path.join(dest_path, 'nssdb') shutil.copytree(db_path, dest_db_path) for root, _, files in os.walk(dest_path): os.chown(root, self._uid, self._gid) for f in files: os.chown(os.path.join(root, f), self._uid, self._gid)
"""Parent override: this gets executed inside the container by the first container process. Take this opportunity to launch the Surrogate.""" super(SurrogateContainer, self).on_start() disable_features = ['CredentialManagementAPI', 'PointerEvent'] if not config.getboolean('surrogate', 'ssl_validate_cert_san'): disable_features.append('SSLCommonNameMismatchHandling') sv_cr_args = [ '--monitor-js-path=%s' % config.get('service', 'sv_cr_monitor_path'), # TODO: add container API to get the IP instead of reaching # into private state. '--surrogate-server-address=%s' % self._netns_handle.addr, '--surrogate-server-port=%s' % self.SURROGATE_HTTPD_PORT, # Not decoding / painting video frames improves performance. '--no-video-decode', # alsa string format example: 'hw:1,0,1', so use ';' as delimiter. '--alsa-output-device-list=%s' % ';'.join(self._alsa_device_list), # Disable advertising credential manager until implemented '--disable-features=%s' % ','.join(disable_features), # Our VMs don't have GPU support. '--disable-gpu', '--disable-smooth-scrolling', '--display=%s' % self._get_fully_qualified_display_name(), '--safeview-safe-rid=%s' % self.safe_rid, '--safeview-control-socket-file=%s' % os.path.join(self._home_dir_mount, self.CONTROL_SOCKET_NAME)] if not config.getboolean('surrogate', 'enable_setuid_sandbox'): sv_cr_args.append('--disable-setuid-sandbox') if config.getboolean('surrogate', 'debug'): sv_cr_args.append('--v=1') if config.getboolean('surrogate', 'enable_inspector'): sv_cr_args.append('--remote-debugging-port=%s' % config.get( 'surrogate', 'debug_port')) if not config.getboolean('policy-enforcement-server', 'enabled'): sv_cr_args.append('--disable-policy-enforcement') if config.getboolean('service', 'global_flash_on'): sv_cr_args.append('--global-flash-on') if config.getboolean('reporting', 'enable_event_reporting'): sv_cr_args.append('--enable-event-reporting') extra_args = config.get('surrogate_manager', 'extra_chromium_args') if extra_args: sv_cr_args += [arg.strip() for arg in extra_args.split(',')] self.log.debug('sv_cr_args = %s', sv_cr_args) # FileServer needs read-access to the Downloads dir. If we let # Chromium create it, it won't give group read permission. download_dir = os.path.join(self._home_dir_mount, 'Downloads') safly.misc.safe_makedirs(download_dir) os.chmod(download_dir, 0750) if self._allow_container_debugging: self._start_extra_debugging_tools()
# Redirect stdout/stderr to the surrogate log.
for fd_redir_now in (1, 2):
    try:
        os.dup2(self._surrogate_log_fd, fd_redir_now)
    except Exception as e:
        self.log.error({'error': e,
                        'fd_src': self._surrogate_log_fd,
                        'fd_dest': fd_redir_now},
                       event='log-redirect-failed', opslog=True)
        raise
os_helpers.secure_exec(config.get('service', 'sv_cr_path'),
                       sv_cr_args,
                       # Close everything but stdin/stdout/stderr.
                       fds_to_leave_open={0, 1, 2},
                       env=os.environ)
"""Check if the Surrogate is dead, and if so, reap (i.e., wait()) it. This must be done to avoid zombies and to ensure disposal of Surrogate file-system state (e.g., downloads).""" assert(self.pid > 0) # WNOHANG ==> don't block if @pid isn't actually dead. self.log.info({'pid': self.pid, 'status': status, 'exit_code': os.WEXITSTATUS(status), 'term_sig': os.WTERMSIG(status), 'core_dumped': os.WCOREDUMP(status)}, event='reaped')
event='ctrl-sock-connect')
assert(fileno == self._ctrl_sock.fileno())
self.kill('control-sock-already-connected')
io_loop=self._io_loop)
self.log.info('', event='ctrl-sock-closed')
# The assumption here is that the Surrogate won't close its control
# socket until it has finished sync'ing any state.
self.kill()
"""Setup container routing rules.""" assert(self._netns_handle) assert(self.pid) else: proxy_port = None 'bounce_devices').strip().split(',') # No bounce-all ===> use the default gateway/route. # Auto-select the default device (i.e., the one associated with the # default network route). # Use virtual proxy IP only if upstream proxy is in use. upstream_proxy_config = self._config.get('proxy_config') else: upstream_proxy_config, io_loop=self._io_loop)
('tcp', config.getint('file-server', 'port')),
('tcp', config.getint('policy-enforcement-server', 'port')),
# FIXME: SECURITY: not a good idea to give direct
# access to Redis API due to large attack surface.
# Is this needed in Chromium?
('tcp', config.getint('reporting', 'redis_server_port')),
# Surrogate needs to access Safedocs directly
# through 10.3.0.1:8081.
('tcp', config.getint('docview', 'port'))]
# FIXME: SECURITY: best never to expose services on the host, even for
# test deployments. Consider moving the test page web server out of the
# deployment.
allowed_ports.extend([('tcp', config.getint('director', 'opaque_http_port')),
                      ('tcp', config.getint('director', 'opaque_https_port'))])
'extra_input_ports').strip()
allowed_ports.extend([tuple(tup.split(':'))
                      for tup in input_spec.split(',')])
"""Let Surrogate know that its container has been set up.
This includes sending Surrogate its browser state (e.g., cookies)."""
self.log.error({'error': resp.error},
               event='browser-state-fetch-failed')
else:
event='browser-state-loaded')
# Let Surrogate know that coupling is complete. It won't start
# network activity (e.g., page load) until it gets this.
'f': self._flash_list,
's': self._site_flags}
proxy_port = int((self._config['proxy_config']).split(':')[1])
proxy_config = '%s:%s' % (self.SURROGATE_PROXY_IP, proxy_port)
else:
# TODO: eliminate this path once we get rid of SquidLocal.
"""Get browser state from the BrowserStateServer, relay to Surrogate.
Currently, browser state refers to cookies, but could include LocalStorage
content (and more) in the future."""
body=json.dumps({'bid': bid}))
self._send_command({'c': 'p',
                    'f': pnr_info['flash_list'],
                    's': pnr_info['site_flags']})
"""Do container setup that needs to be customized on a per user basis.
Examples include routing setup (due to bounce tunnel support), and fetching cookies."""
{'bid': 'redacted'}.items())},
event='got-coupling-request')
# Start checking for active connections.
self._on_connect_check_timer,
config.getint('surrogate_manager',
              'connect_check_interval_secs') * 1000)
PnrObserver.instance().register(
    self._config['tenant_id'], self._pnr_update),
on_pnr_future_received)
"""Tell BrowserStateServer to persist this Surrogate's state.""" def on_state_sync_completed(resp): """BrowserStateServer has persisted changes, so let Surrogate know.""" if resp.error: self.log.error({'error': resp.error}, event='browser-state-sync-failed') else: self.log.info({'body': resp.body}, event='browser-state-sync-complete') # Let Surrogate know. It may use this as an indication that it is # safe to shut down. self._send_command({'c': 'A', 'r': not resp.error})
http_client = tornado.curl_httpclient.CurlAsyncHTTPClient()
url = urlparse.urljoin(STATE_SERVER_URL, '/save')
connect_timeout, request_timeout = self.REQ_TIMEOUT_BY_KIND['state-sync']
req = tornado.httpclient.HTTPRequest(
    url, method='POST',
    connect_timeout=connect_timeout,
    request_timeout=request_timeout,
    body=json.dumps({'bid': self._config['bid']}))
http_client.fetch(req, on_state_sync_completed)
"""Surrogate wants us to save browser state for future sessions.""" assert(isinstance(action, list)) def on_resp(resp): if resp.error: self.log.error({'error': resp.error}, event='browser-state-update-failed') # Quite a few of these. self.log.debug({'action': action}, event='browser-state-update') http_client = tornado.curl_httpclient.CurlAsyncHTTPClient() url = urlparse.urljoin(STATE_SERVER_URL, '/push') params = {'bid': self._config['bid'], 'action': action} req = tornado.httpclient.HTTPRequest(url, method='POST', body=json.dumps(params)) http_client.fetch(req, on_resp)
"""Notify event consumer of message from surrogate indicating a resource load."""
# pylint: disable=too-many-branches,too-many-return-statements
"""Process status update messages coming from the Surrogate."""
except ValueError:
    self.log.error('msg=%s details="Bad JSON"', msg, event='bad-ctrl-msg')
    return
self.log.error('msg=%s details="Bad JSON"', msg, event='bad-ctrl-msg')
return
self.log.error('msg=%s details="Missing config"', msg,
               event='bad-ctrl-msg')
return
except Exception:
    self.log.exception({}, event='coupling-setup-failed')
    self.kill('coupling-setup-threw')
if 'a' not in msg:
    self.log.error('msg=%s details="Missing cookie server command"',
                   msg, event='bad-ctrl-msg')
    return
self._on_browser_state_update_from_surrogate(msg['a'])
self._on_state_sync_requested()
if 'i' not in msg or 'v' not in msg:
    if 'i' in msg:
        future = self._pending_ctrl_commands.pop(msg['i'], None)
        if future:
            future.set_exception(RuntimeError(
                'Bad response from surrogate'))
    self.log.error('msg=%s details="Missing id or value in response"',
                   msg, event='bad-ctrl-msg')
    return
future = self._pending_ctrl_commands.pop(msg['i'], None)
if not future:
    self.log.error('msg=%s details="Unknown id in response"', msg,
                   event='bad-ctrl-msg')
    return
future.set_result(msg['v'])
self._report_event('tab_limit_exceeded',
                   data={'limit': self._config['tab_limit']})
self.log.info({'limit': self._config['tab_limit']},
              event='tab-limit-reached', opslog=True)
else:
    self.log.error('msg=%s details="Unhandled message"', msg,
                   event='bad-ctrl-msg')
    return
"""Handle the control message sent by the Surrogate.""" except Exception: self.log.exception({'msg': msg}, event='handle-ctrl-message-failed') self.kill('handle-ctrl-message-failed')
"""Parent override: container is dead; clean up Surrogate-specific state, especially that allocated on start().""" # CAREFUL: be sure to deregister the ctrl socket handler since # Python will not GC the Container object while Tornado IOLoop holds # a ref to it. # Not strictly necessary since ctrl_sock will be automatically # closed in class destructor, but I'm keeping this in for # consistency. except Exception: self.log.warning('failed to close control server socket', event='ctrl-sock-close-failed') except Exception: self.log.warning('failed to close control stream', event='ctrl-stream-close-failed') except Exception: self.log.error({'instance': router}, event='router-close-failed')
return {
    'container_stats': self.get_stats(),
    'config': self._config,
    'coupled_time': self._coupled_time,
    'creation_time': self._creation_time,
    'safe_rid': self.safe_rid,
    'state': self._state,
    'safe_bid': self.safe_bid,
    'version': self._version,
    'cr_version': self._cr_version,
    'flash_version': self._flash_version,
    'pid': self.pid,
    'unix_username': self._username,
    'cgroup': self._cg_handle.name,
    'task_pids': list(self._cg_handle.get_pids()),
    'ip': (self._netns_handle.addr
           if self._netns_handle else 'unknown')}
self.log.warning({}, event='ctrl-sock-already-closed')
if response_expected:
    future = tornado.concurrent.Future()
    future.set_exception(IOError('no channel'))
    return future
return
future = tornado.concurrent.Future()
self._pending_ctrl_commands[cmd_id] = future
return future
return self._send_command({'c': 'd'}, True)
def detach_tab(self, tab_id):
    res = yield self._send_command({'c': 'D', 't': int(tab_id)}, True)
    self.log.info({'tab_id': tab_id, 'res': res}, event='tab-detach')
    if res == 'NO_TAB':
        raise KeyError(tab_id)
    elif res == 'OK':
        return
    else:
        raise Exception(res)
"""Could happen if browser process gets stuck for whatever reason.""" # TODO: have Surrogate send us keepalive on an auxiliary channel; # the lack of keepalive would indicate something is wrong. This was # super important for WebKit since it didn't implement this # internally; Chromium does, however. pass
# Some stats may not be available on this machine (e.g.,
# total_swap won't be available if there is no swap configured).
continue
changed_keys.append(k)
stats.update({'changed_keys': changed_keys})
self.log.info(stats, event='container-stats')
if 'total_pgmajfault' in changed_keys:
    self.log.warning(stats, event='thrashing')
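# A minimal sketch of the threshold-based change detection implied by the
# 'stats_logging_thresholds' config and the changed_keys fragments above;
# the helper name and the structure of the thresholds dict are assumptions.
def _changed_stat_keys(prev_stats, stats, thresholds):
    """Return keys whose values moved by at least their threshold."""
    changed_keys = []
    for k, threshold in thresholds.items():
        if k not in stats or k not in prev_stats:
            # Some stats may not be available on this machine (e.g.,
            # total_swap won't be available if there is no swap configured).
            continue
        if abs(stats[k] - prev_stats[k]) >= threshold:
            changed_keys.append(k)
    return changed_keys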
return self.safe_rid