allocate.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610
  1. package chromedp
  2. import (
  3. "bufio"
  4. "bytes"
  5. "context"
  6. "errors"
  7. "fmt"
  8. "io"
  9. "os"
  10. "os/exec"
  11. "path/filepath"
  12. "runtime"
  13. "sync"
  14. "time"
  15. )
// An Allocator is responsible for creating and managing a number of browsers.
//
// This interface abstracts away how the browser process is actually run. For
// example, an Allocator implementation may reuse browser processes, or connect
// to already-running browsers on remote machines.
//
// This file provides two implementations: ExecAllocator, which starts a
// browser process on the host, and RemoteAllocator, which connects to a
// browser via a websocket URL.
type Allocator interface {
	// Allocate creates a new browser. It can be cancelled via the provided
	// context, at which point all the resources used by the browser (such
	// as temporary directories) will be freed.
	Allocate(context.Context, ...BrowserOption) (*Browser, error)

	// Wait blocks until an allocator has freed all of its resources.
	// Cancelling the allocator context will already perform this operation,
	// so normally there's no need to call Wait directly.
	Wait()
}
  31. // setupExecAllocator is similar to NewExecAllocator, but it allows NewContext
  32. // to create the allocator without the unnecessary context layer.
  33. func setupExecAllocator(opts ...ExecAllocatorOption) *ExecAllocator {
  34. ep := &ExecAllocator{
  35. initFlags: make(map[string]interface{}),
  36. wsURLReadTimeout: 20 * time.Second,
  37. }
  38. for _, o := range opts {
  39. o(ep)
  40. }
  41. if ep.execPath == "" {
  42. ep.execPath = findExecPath()
  43. }
  44. return ep
  45. }
// DefaultExecAllocatorOptions are the ExecAllocator options used by NewContext
// if the given parent context doesn't have an allocator set up. Do not modify
// this global; instead, use NewExecAllocator. See [ExampleExecAllocator].
//
// [ExampleExecAllocator]: https://pkg.go.dev/github.com/chromedp/chromedp#example-ExecAllocator
var DefaultExecAllocatorOptions = [...]ExecAllocatorOption{
	NoFirstRun,
	NoDefaultBrowserCheck,
	Headless,

	// The flags below follow Puppeteer's default behavior.
	Flag("disable-background-networking", true),
	Flag("enable-features", "NetworkService,NetworkServiceInProcess"),
	Flag("disable-background-timer-throttling", true),
	Flag("disable-backgrounding-occluded-windows", true),
	Flag("disable-breakpad", true),
	Flag("disable-client-side-phishing-detection", true),
	Flag("disable-default-apps", true),
	Flag("disable-dev-shm-usage", true),
	Flag("disable-extensions", true),
	Flag("disable-features", "site-per-process,Translate,BlinkGenPropertyTrees"),
	Flag("disable-hang-monitor", true),
	Flag("disable-ipc-flooding-protection", true),
	Flag("disable-popup-blocking", true),
	Flag("disable-prompt-on-repost", true),
	Flag("disable-renderer-backgrounding", true),
	Flag("disable-sync", true),
	Flag("force-color-profile", "srgb"),
	Flag("metrics-recording-only", true),
	Flag("safebrowsing-disable-auto-update", true),
	Flag("enable-automation", true),
	Flag("password-store", "basic"),
	Flag("use-mock-keychain", true),
}
  79. // NewExecAllocator creates a new context set up with an ExecAllocator, suitable
  80. // for use with NewContext.
  81. func NewExecAllocator(parent context.Context, opts ...ExecAllocatorOption) (context.Context, context.CancelFunc) {
  82. ctx, cancel := context.WithCancel(parent)
  83. c := &Context{Allocator: setupExecAllocator(opts...)}
  84. ctx = context.WithValue(ctx, contextKey{}, c)
  85. cancelWait := func() {
  86. cancel()
  87. c.Allocator.Wait()
  88. }
  89. return ctx, cancelWait
  90. }
// ExecAllocatorOption is an exec allocator option: a function that configures
// an ExecAllocator. Options are applied in order when the allocator is set up.
type ExecAllocatorOption = func(*ExecAllocator)
// ExecAllocator is an Allocator which starts new browser processes on the host
// machine.
type ExecAllocator struct {
	// execPath is the program passed to exec.CommandContext in Allocate.
	execPath string
	// initFlags maps flag names to values. Allocate turns string values
	// into --name=value and true booleans into --name; any other value
	// type makes Allocate fail.
	initFlags map[string]interface{}
	// initEnv holds NAME=value entries appended last to the environment of
	// the browser process.
	initEnv []string

	// Chrome will sometimes fail to print the websocket, or run for a long
	// time, without properly exiting. To avoid blocking forever in those
	// cases, give up after a specified timeout.
	wsURLReadTimeout time.Duration

	// modifyCmdFunc, if non-nil, is called on the command before it is
	// started, instead of allocateCmdOptions.
	modifyCmdFunc func(cmd *exec.Cmd)

	// wg tracks the background goroutines started by Allocate; Wait blocks
	// on it.
	wg sync.WaitGroup

	// combinedOutputWriter, if non-nil, receives the browser's combined
	// stdout and stderr.
	combinedOutputWriter io.Writer
}
// allocTempDir is used to group all ExecAllocator temporary user data dirs in
// the same location, useful for the tests. If left empty, the system's default
// temporary directory is used (it is passed as the dir argument of
// os.MkdirTemp).
var allocTempDir string
  111. // Allocate satisfies the Allocator interface.
  112. func (a *ExecAllocator) Allocate(ctx context.Context, opts ...BrowserOption) (*Browser, error) {
  113. c := FromContext(ctx)
  114. if c == nil {
  115. return nil, ErrInvalidContext
  116. }
  117. var args []string
  118. for name, value := range a.initFlags {
  119. switch value := value.(type) {
  120. case string:
  121. args = append(args, fmt.Sprintf("--%s=%s", name, value))
  122. case bool:
  123. if value {
  124. args = append(args, fmt.Sprintf("--%s", name))
  125. }
  126. default:
  127. return nil, fmt.Errorf("invalid exec pool flag")
  128. }
  129. }
  130. removeDir := false
  131. dataDir, ok := a.initFlags["user-data-dir"].(string)
  132. if !ok {
  133. tempDir, err := os.MkdirTemp(allocTempDir, "chromedp-runner")
  134. if err != nil {
  135. return nil, err
  136. }
  137. args = append(args, "--user-data-dir="+tempDir)
  138. dataDir = tempDir
  139. removeDir = true
  140. }
  141. if _, ok := a.initFlags["no-sandbox"]; !ok && os.Getuid() == 0 {
  142. // Running as root, for example in a Linux container. Chrome
  143. // needs --no-sandbox when running as root, so make that the
  144. // default, unless the user set Flag("no-sandbox", false).
  145. args = append(args, "--no-sandbox")
  146. }
  147. if _, ok := a.initFlags["remote-debugging-port"]; !ok {
  148. args = append(args, "--remote-debugging-port=0")
  149. }
  150. // Force the first page to be blank, instead of the welcome page;
  151. // --no-first-run doesn't enforce that.
  152. args = append(args, "about:blank")
  153. cmd := exec.CommandContext(ctx, a.execPath, args...)
  154. defer func() {
  155. if removeDir && cmd.Process == nil {
  156. // We couldn't start the process, so we didn't get to
  157. // the goroutine that handles RemoveAll below. Remove it
  158. // to not leave an empty directory.
  159. os.RemoveAll(dataDir)
  160. }
  161. }()
  162. if a.modifyCmdFunc != nil {
  163. a.modifyCmdFunc(cmd)
  164. } else {
  165. allocateCmdOptions(cmd)
  166. }
  167. stdout, err := cmd.StdoutPipe()
  168. if err != nil {
  169. return nil, err
  170. }
  171. cmd.Stderr = cmd.Stdout
  172. // Preserve environment variables set in the (lowest priority) existing
  173. // environment, OverrideCmdFunc(), and Env (highest priority)
  174. if len(a.initEnv) > 0 || len(cmd.Env) > 0 {
  175. cmd.Env = append(os.Environ(), cmd.Env...)
  176. cmd.Env = append(cmd.Env, a.initEnv...)
  177. }
  178. // We must start the cmd before calling cmd.Wait, as otherwise the two
  179. // can run into a data race.
  180. if err := cmd.Start(); err != nil {
  181. return nil, err
  182. }
  183. select {
  184. case <-ctx.Done():
  185. return nil, ctx.Err()
  186. case <-c.allocated: // for this browser's root context
  187. }
  188. a.wg.Add(1) // for the entire allocator
  189. if a.combinedOutputWriter != nil {
  190. a.wg.Add(1) // for the io.Copy in a separate goroutine
  191. }
  192. go func() {
  193. // First wait for the process to be finished.
  194. // TODO: do we care about this error in any scenario? if the
  195. // user cancelled the context and killed chrome, this will most
  196. // likely just be "signal: killed", which isn't interesting.
  197. cmd.Wait()
  198. // Then delete the temporary user data directory, if needed.
  199. if removeDir {
  200. // Sometimes files/directories are still created in the user data
  201. // directory at this point. I can not reproduce it with strace, so
  202. // the reason is unknown yet. As a workaround, we will just wait a
  203. // little while before removing the directory.
  204. <-time.After(10 * time.Millisecond)
  205. if err := os.RemoveAll(dataDir); c.cancelErr == nil {
  206. c.cancelErr = err
  207. }
  208. }
  209. a.wg.Done()
  210. close(c.allocated)
  211. }()
  212. var wsURL string
  213. wsURLChan := make(chan struct{}, 1)
  214. go func() {
  215. wsURL, err = readOutput(stdout, a.combinedOutputWriter, a.wg.Done)
  216. wsURLChan <- struct{}{}
  217. }()
  218. select {
  219. case <-wsURLChan:
  220. case <-time.After(a.wsURLReadTimeout):
  221. err = errors.New("websocket url timeout reached")
  222. }
  223. if err != nil {
  224. if a.combinedOutputWriter != nil {
  225. // There's no io.Copy goroutine to call the done func.
  226. // TODO: a cleaner way to deal with this edge case?
  227. a.wg.Done()
  228. }
  229. return nil, err
  230. }
  231. browser, err := NewBrowser(ctx, wsURL, opts...)
  232. if err != nil {
  233. return nil, err
  234. }
  235. go func() {
  236. // If the browser loses connection, kill the entire process and
  237. // handler at once. Don't use Cancel, as that will attempt to
  238. // gracefully close the browser, which will hang.
  239. // Don't cancel if we're in the middle of a graceful Close,
  240. // since we want to let Chrome shut itself when it is fully
  241. // finished.
  242. <-browser.LostConnection
  243. select {
  244. case <-browser.closingGracefully:
  245. default:
  246. c.cancel()
  247. }
  248. }()
  249. browser.process = cmd.Process
  250. browser.userDataDir = dataDir
  251. return browser, nil
  252. }
  253. // readOutput grabs the websocket address from chrome's output, returning as
  254. // soon as it is found. All read output is forwarded to forward, if non-nil.
  255. // done is used to signal that the asynchronous io.Copy is done, if any.
  256. func readOutput(rc io.ReadCloser, forward io.Writer, done func()) (wsURL string, _ error) {
  257. prefix := []byte("DevTools listening on")
  258. var accumulated bytes.Buffer
  259. bufr := bufio.NewReader(rc)
  260. readLoop:
  261. for {
  262. line, err := bufr.ReadBytes('\n')
  263. if err != nil {
  264. return "", fmt.Errorf("chrome failed to start:\n%s",
  265. accumulated.Bytes())
  266. }
  267. if forward != nil {
  268. if _, err := forward.Write(line); err != nil {
  269. return "", err
  270. }
  271. }
  272. if bytes.HasPrefix(line, prefix) {
  273. line = line[len(prefix):]
  274. // use TrimSpace, to also remove \r on Windows
  275. line = bytes.TrimSpace(line)
  276. wsURL = string(line)
  277. break readLoop
  278. }
  279. accumulated.Write(line)
  280. }
  281. if forward == nil {
  282. // We don't need the process's output anymore.
  283. rc.Close()
  284. } else {
  285. // Copy the rest of the output in a separate goroutine, as we
  286. // need to return with the websocket URL.
  287. go func() {
  288. io.Copy(forward, bufr)
  289. done()
  290. }()
  291. }
  292. return wsURL, nil
  293. }
// Wait satisfies the Allocator interface. It blocks until the allocator's
// WaitGroup, which Allocate increments for its background goroutines, drops
// back to zero.
func (a *ExecAllocator) Wait() {
	a.wg.Wait()
}
  298. // ExecPath returns an ExecAllocatorOption which uses the given path to execute
  299. // browser processes. The given path can be an absolute path to a binary, or
  300. // just the name of the program to find via exec.LookPath.
  301. func ExecPath(path string) ExecAllocatorOption {
  302. return func(a *ExecAllocator) {
  303. // Convert to an absolute path if possible, to avoid
  304. // repeated LookPath calls in each Allocate.
  305. if fullPath, _ := exec.LookPath(path); fullPath != "" {
  306. a.execPath = fullPath
  307. } else {
  308. a.execPath = path
  309. }
  310. }
  311. }
  312. // findExecPath tries to find the Chrome browser somewhere in the current
  313. // system. It finds in different locations on different OS systems.
  314. // It could perform a rather aggressive search. That may make it a bit slow,
  315. // but it will only be run when creating a new ExecAllocator.
  316. func findExecPath() string {
  317. var locations []string
  318. switch runtime.GOOS {
  319. case "darwin":
  320. locations = []string{
  321. // Mac
  322. "/Applications/Chromium.app/Contents/MacOS/Chromium",
  323. "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
  324. }
  325. case "windows":
  326. locations = []string{
  327. // Windows
  328. "chrome",
  329. "chrome.exe", // in case PATHEXT is misconfigured
  330. `C:\Program Files (x86)\Google\Chrome\Application\chrome.exe`,
  331. `C:\Program Files\Google\Chrome\Application\chrome.exe`,
  332. filepath.Join(os.Getenv("USERPROFILE"), `AppData\Local\Google\Chrome\Application\chrome.exe`),
  333. filepath.Join(os.Getenv("USERPROFILE"), `AppData\Local\Chromium\Application\chrome.exe`),
  334. }
  335. default:
  336. locations = []string{
  337. // Unix-like
  338. "headless_shell",
  339. "headless-shell",
  340. "chromium",
  341. "chromium-browser",
  342. "google-chrome",
  343. "google-chrome-stable",
  344. "google-chrome-beta",
  345. "google-chrome-unstable",
  346. "/usr/bin/google-chrome",
  347. "/usr/local/bin/chrome",
  348. "/snap/bin/chromium",
  349. "chrome",
  350. }
  351. }
  352. for _, path := range locations {
  353. found, err := exec.LookPath(path)
  354. if err == nil {
  355. return found
  356. }
  357. }
  358. // Fall back to something simple and sensible, to give a useful error
  359. // message.
  360. return "google-chrome"
  361. }
// Flag is a generic command line option to pass a flag to Chrome. If the value
// is a string, it will be passed as --name=value. If it's a boolean, it will be
// passed as --name if value is true.
//
// Values of any other type are stored as given, but make Allocate return an
// error. Setting the same name again replaces the earlier value.
func Flag(name string, value interface{}) ExecAllocatorOption {
	return func(a *ExecAllocator) {
		a.initFlags[name] = value
	}
}
// Env is a list of generic environment variables in the form NAME=value
// to pass into the new Chrome process. These will be appended to the
// environment of the Go process as retrieved by os.Environ.
//
// Using Env more than once accumulates the variables rather than replacing
// them.
func Env(vars ...string) ExecAllocatorOption {
	return func(a *ExecAllocator) {
		a.initEnv = append(a.initEnv, vars...)
	}
}
// ModifyCmdFunc allows for running an arbitrary function on the
// browser exec.Cmd object. This overrides the default version
// of the command which sends SIGKILL to any open browsers when
// the Go program exits.
//
// When set, Allocate calls f on the command before starting it, instead of
// applying its default command options.
func ModifyCmdFunc(f func(cmd *exec.Cmd)) ExecAllocatorOption {
	return func(a *ExecAllocator) {
		a.modifyCmdFunc = f
	}
}
  387. // UserDataDir is the command line option to set the user data dir.
  388. //
  389. // Note: set this option to manually set the profile directory used by Chrome.
  390. // When this is not set, then a default path will be created in the /tmp
  391. // directory.
  392. func UserDataDir(dir string) ExecAllocatorOption {
  393. return Flag("user-data-dir", dir)
  394. }
  395. // ProxyServer is the command line option to set the outbound proxy server.
  396. func ProxyServer(proxy string) ExecAllocatorOption {
  397. return Flag("proxy-server", proxy)
  398. }
  399. // IgnoreCertErrors is the command line option to ignore certificate-related
  400. // errors. This option is useful when you need to access an HTTPS website
  401. // through a proxy.
  402. func IgnoreCertErrors(a *ExecAllocator) {
  403. Flag("ignore-certificate-errors", true)(a)
  404. }
  405. // WindowSize is the command line option to set the initial window size.
  406. func WindowSize(width, height int) ExecAllocatorOption {
  407. return Flag("window-size", fmt.Sprintf("%d,%d", width, height))
  408. }
  409. // UserAgent is the command line option to set the default User-Agent
  410. // header.
  411. func UserAgent(userAgent string) ExecAllocatorOption {
  412. return Flag("user-agent", userAgent)
  413. }
  414. // NoSandbox is the Chrome command line option to disable the sandbox.
  415. func NoSandbox(a *ExecAllocator) {
  416. Flag("no-sandbox", true)(a)
  417. }
  418. // NoFirstRun is the Chrome command line option to disable the first run
  419. // dialog.
  420. func NoFirstRun(a *ExecAllocator) {
  421. Flag("no-first-run", true)(a)
  422. }
  423. // NoDefaultBrowserCheck is the Chrome command line option to disable the
  424. // default browser check.
  425. func NoDefaultBrowserCheck(a *ExecAllocator) {
  426. Flag("no-default-browser-check", true)(a)
  427. }
  428. // Headless is the command line option to run in headless mode. On top of
  429. // setting the headless flag, it also hides scrollbars and mutes audio.
  430. func Headless(a *ExecAllocator) {
  431. Flag("headless", true)(a)
  432. // Like in Puppeteer.
  433. Flag("hide-scrollbars", true)(a)
  434. Flag("mute-audio", true)(a)
  435. }
  436. // DisableGPU is the command line option to disable the GPU process.
  437. //
  438. // The --disable-gpu option is a temporary workaround for a few bugs
  439. // in headless mode. According to the references below, it's no longer required:
  440. // - https://bugs.chromium.org/p/chromium/issues/detail?id=737678
  441. // - https://github.com/puppeteer/puppeteer/pull/2908
  442. // - https://github.com/puppeteer/puppeteer/pull/4523
  443. //
  444. // But according to this reported issue, it's still required in some cases:
  445. // - https://github.com/chromedp/chromedp/issues/904
  446. func DisableGPU(a *ExecAllocator) {
  447. Flag("disable-gpu", true)(a)
  448. }
// CombinedOutput is used to set an io.Writer where stdout and stderr
// from the browser will be sent.
//
// Allocate forwards the output to w line by line while it waits for the
// websocket URL, and copies the remainder in a background goroutine.
func CombinedOutput(w io.Writer) ExecAllocatorOption {
	return func(a *ExecAllocator) {
		a.combinedOutputWriter = w
	}
}
// WSURLReadTimeout sets the waiting time for reading the WebSocket URL.
// The default value is 20 seconds.
//
// If the browser has not printed the URL within this time, Allocate returns
// an error.
func WSURLReadTimeout(t time.Duration) ExecAllocatorOption {
	return func(a *ExecAllocator) {
		a.wsURLReadTimeout = t
	}
}
  463. // NewRemoteAllocator creates a new context set up with a RemoteAllocator,
  464. // suitable for use with NewContext. The url should point to the browser's
  465. // websocket address, such as "ws://127.0.0.1:$PORT/devtools/browser/...".
  466. //
  467. // If the url does not contain "/devtools/browser/", it will try to detect
  468. // the correct one by sending a request to "http://$HOST:$PORT/json/version".
  469. //
  470. // The url with the following formats are accepted:
  471. // - ws://127.0.0.1:9222/
  472. // - http://127.0.0.1:9222/
  473. //
  474. // But "ws://127.0.0.1:9222/devtools/browser/" are not accepted.
  475. // Because the allocator won't try to modify it and it's obviously invalid.
  476. //
  477. // Use chromedp.NoModifyURL to prevent it from modifying the url.
  478. func NewRemoteAllocator(parent context.Context, url string, opts ...RemoteAllocatorOption) (context.Context, context.CancelFunc) {
  479. a := &RemoteAllocator{
  480. wsURL: url,
  481. modifyURLFunc: modifyURL,
  482. }
  483. for _, o := range opts {
  484. o(a)
  485. }
  486. c := &Context{Allocator: a}
  487. ctx, cancel := context.WithCancel(parent)
  488. ctx = context.WithValue(ctx, contextKey{}, c)
  489. return ctx, cancel
  490. }
// RemoteAllocatorOption is a remote allocator option: a function that
// configures a RemoteAllocator. NewRemoteAllocator applies options in order.
type RemoteAllocatorOption = func(*RemoteAllocator)
// RemoteAllocator is an Allocator which connects to an already running Chrome
// process via a websocket URL.
type RemoteAllocator struct {
	// wsURL is the URL given to NewRemoteAllocator.
	wsURL string
	// modifyURLFunc, if non-nil, is called by Allocate to derive the URL
	// to connect to from wsURL. NoModifyURL sets it to nil.
	modifyURLFunc func(ctx context.Context, wsURL string) (string, error)

	// wg tracks the cleanup goroutines started by Allocate; Wait blocks on
	// it.
	wg sync.WaitGroup
}
  500. // Allocate satisfies the Allocator interface.
  501. func (a *RemoteAllocator) Allocate(ctx context.Context, opts ...BrowserOption) (*Browser, error) {
  502. c := FromContext(ctx)
  503. if c == nil {
  504. return nil, ErrInvalidContext
  505. }
  506. wsURL := a.wsURL
  507. var err error
  508. if a.modifyURLFunc != nil {
  509. wsURL, err = a.modifyURLFunc(ctx, wsURL)
  510. if err != nil {
  511. return nil, fmt.Errorf("failed to modify wsURL: %w", err)
  512. }
  513. }
  514. // Use a different context for the websocket, so we can have a chance at
  515. // closing the relevant pages before closing the websocket connection.
  516. wctx, cancel := context.WithCancel(context.Background())
  517. close(c.allocated)
  518. a.wg.Add(1) // for the entire allocator
  519. go func() {
  520. <-ctx.Done()
  521. Cancel(ctx) // block until all pages are closed
  522. cancel() // close the websocket connection
  523. a.wg.Done()
  524. }()
  525. browser, err := NewBrowser(wctx, wsURL, opts...)
  526. if err != nil {
  527. return nil, err
  528. }
  529. go func() {
  530. // If the browser loses connection, kill the entire process and
  531. // handler at once.
  532. <-browser.LostConnection
  533. select {
  534. case <-browser.closingGracefully:
  535. default:
  536. Cancel(ctx)
  537. }
  538. }()
  539. return browser, nil
  540. }
// Wait satisfies the Allocator interface. It blocks until the cleanup
// goroutines started by Allocate have finished.
func (a *RemoteAllocator) Wait() {
	a.wg.Wait()
}
// NoModifyURL is a RemoteAllocatorOption that prevents the remote allocator
// from modifying the websocket debugger URL passed to it. It does so by
// clearing modifyURLFunc, so Allocate connects to the URL exactly as given.
func NoModifyURL(a *RemoteAllocator) {
	a.modifyURLFunc = nil
}