-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathplaybook-ubuntu-setup.yml
271 lines (239 loc) · 8.75 KB
/
playbook-ubuntu-setup.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
---
# Define variables for this playbook:
- hosts: all
  # 'ansible_env' (used below, and again for PATH in a later play) only exists
  # once facts have been gathered, so request them explicitly instead of
  # relying on the configured default gathering policy.
  gather_facts: true
  tasks:
    - name: Record the absolute path of the crawler directory
      set_fact:
        # This must be an absolute path to use with 'unarchive', otherwise it
        # will break in non-obvious ways.
        crawler_dir: "{{ ansible_env.HOME }}/iframe-crawler"
- name: Ensure ansible's 'apt' module can be used with this host
  hosts: all
  become: true
  tasks:
    - name: Install ansible's apt module's remote prerequisites
      # Unfortunately, using 'command' means that these steps will always have
      # the state 'changed' regardless of whether any changes were made.
      # Maybe install python-apt too?
      #
      # 'apt-get' is used rather than 'apt' because the 'apt' front end warns
      # that its command line interface is not stable for use in scripts.
      command: "{{ item }}"
      environment:
        # Nobody is attached to answer debconf questions during the upgrade.
        DEBIAN_FRONTEND: noninteractive
      loop:
        # DigitalOcean's apt needs an update first to find packages (aptitude)
        - apt-get update
        # '--with-new-pkgs' keeps the behaviour of 'apt upgrade', which may
        # install new packages when an upgraded package requires them.
        - apt-get upgrade -y --with-new-pkgs
        - apt-get install -y python3-apt aptitude
- name: Ensure unzip in installed for the 'unarchive' module
  become: true
  gather_facts: false
  hosts: all
  tasks:
    # Later plays hand .zip archives (the top sites lists) to 'unarchive', so
    # make sure the unzip package is on the host before they run.
    - name: Install unzip
      apt:
        state: latest
        name: unzip
- name: Ensure we have the crawler repository
  hosts: all
  gather_facts: false
  tasks:
    - name: Clone/update the crawler repository
      git:
        repo: https://github.com/jwatt/iframe-crawler.git
        # Check out into the same absolute directory that every later play
        # refers to as 'crawler_dir', instead of a path relative to whatever
        # the remote working directory happens to be.
        dest: "{{ crawler_dir }}"
# I've used some JavaScript language features that don't work in Node.js v10,
# such as `for await`. Ubuntu 19.10 Server has Node.js v10 (originally
# released on 2018-04-24) installed, and Ubuntu 18.04 only has Node.js v8 (and
# it's not installed by default), so we download our own more up to date Node.js
# here.
#
# TODO Consider using `vars_prompt` to ask whether we can install a newer
# version system wide.
# https://stackoverflow.com/questions/25466675/ansible-to-conditionally-prompt-for-a-variable
#
# nodejs.org docs say to use the NodeSource distribution on Debian/Ubuntu if
# installing from a package manager:
# https://nodejs.org/en/download/package-manager/#debian-and-ubuntu-based-linux-distributions-enterprise-linux-fedora-and-snap-packages
#
- name: Install a local up to date version of Node.js
  hosts: all
  gather_facts: false
  tasks:
    # Informational probe only: nothing below reads 'installed_nodejs_info'
    # yet, so a missing 'node' binary must not abort the play, and a read-only
    # command should never be reported as a change.
    - name: Get version number of Node.js, if installed
      ignore_errors: true
      command: node --version
      register: installed_nodejs_info
      changed_when: false
      # To get the major version as an integer, we could use:
      # int(installed_nodejs_info.stdout[1:].split(".")[0])

    #- name: Skip the rest of this play if Node.js is already installed
    #  when: not installed_nodejs_info.failed
    #  meta: end_host

    # The tag GitHub reports as the "latest" release is whatever was published
    # most recently; it is not guaranteed to be an LTS version.
    - name: Get the version number of the latest Node.js release
      uri:
        url: "https://api.github.com/repos/nodejs/node/releases/latest"
        method: GET
        return_content: true
        status_code: 200
        headers:
          Content-Type: "application/json"
        body_format: json
      register: latest_nodejs_info

    - name: Download latest Node.js
      unarchive:
        # Fetch from the directory named after the exact tag we just looked
        # up. The 'dist/latest/' directory only holds one version, which does
        # not have to be the one GitHub reported, and a mismatch is a 404.
        src: "https://nodejs.org/dist/{{ node_version_str }}/node-{{ node_version_str }}-linux-x64.tar.gz"
        remote_src: true
        dest: "{{ crawler_dir }}"
        extra_opts:
          # Strip the version from the top-level directory so that the result
          # always ends up in '<crawler_dir>/node'.
          - "--transform"
          - "s/node-{{ node_version_str }}-linux-x64/node/"
        creates: "{{ crawler_dir }}/node"
      vars:
        node_version_str: "{{ latest_nodejs_info.json.tag_name }}"

    # Makes the binary reachable as 'nodejs' as well as 'node'; the link
    # target is relative to the 'bin' directory the link lives in.
    - name: Create a symbolic link for nodejs
      file:
        src: "node"
        dest: "{{ crawler_dir }}/node/bin/nodejs"
        state: link
        mode: "744"
# If installing using the apt package manager, we'd do something like the
# following.
#
#- name: Download NodeSource install script
# get_url:
# url: https://deb.nodesource.com/setup_{{ latest_node_lts_major_version_num }}.x
# dest: "{{ crawler_dir }}/nodesource-setup-script.bash"
# mode: "+x"
# vars:
# node_version_str: "{{ latest_nodejs_info.json.tag_name }}"
# # LTS versions have even version numbers
# latest_node_lts_major_version_num: "{{ round(int(latest_nodejs_info.json.tag_name[1:].split('.')[0]) / 2) * 2 }}"
#
#- name: Run NodeSource install script
# become: yes
# # Using recursive sudo here in addition to 'become: yes' for the -E flag.
# shell: cat nodesource-setup-script.bash | sudo -E bash -
# args:
# chdir: "{{ crawler_dir }}"
# executable: /bin/bash
#
#- name: Install Node.js
# apt:
# name: nodejs
# state: latest
- name: Ensure webdriver.io in installed
  gather_facts: false
  hosts: all
  tasks:
    - name: Install webdriver.io
      environment:
        # Prepend the 'bin' directory of the Node.js copy kept under the
        # crawler directory. Do we actually need this to invoke our local
        # 'npm'?
        PATH: "{{ crawler_dir }}/node/bin:{{ ansible_env.PATH }}"
      npm:
        # Install into 'node_modules' in our crawler directory, not globally:
        path: "{{ crawler_dir }}"
        global: false
        name: webdriverio
- name: Get geckodriver
  hosts: all
  gather_facts: false
  tasks:
    # Maybe someday the github redirector[*] will allow redirecting to the
    # latest assets even when they contain a version number in their name.
    # For now, we need to get the version number manually and insert it into
    # the URL for the .tar.gz that we want to download.
    #
    # * https://help.github.com/en/github/administering-a-repository/linking-to-releases
    #
    - name: Get latest geckodriver release tag
      uri:
        url: "https://api.github.com/repos/mozilla/geckodriver/releases/latest"
        method: GET
        return_content: true
        status_code: 200
        headers:
          Content-Type: "application/json"
        body_format: json
      register: geckodriver_info

    - name: Fetch and extract geckodriver
      unarchive:
        src: "https://github.com/mozilla/geckodriver/releases/latest/download/geckodriver-{{ geckodriver_info.json.tag_name }}-linux64.tar.gz"
        remote_src: true
        dest: "{{ crawler_dir }}"
        # Skip the download once the extracted binary is already in place.
        creates: "{{ crawler_dir }}/geckodriver"

    - name: Allow localhost connections on port 4444
      ufw:
        rule: allow
        from_ip: 127.0.0.1
        # Deliberately no 'from_port': clients connect from arbitrary
        # ephemeral source ports, so pinning the source port to 4444 would
        # make the rule match almost nothing. Only the destination port is
        # restricted.
        to_ip: 127.0.0.1
        to_port: "4444"
        proto: tcp
      become: true
- name: Ensure the Alexa, Cisco and Majestic top sites lists are present
  gather_facts: false
  hosts: all
  tasks:
    # The Alexa and Cisco archives are both unpacked and then renamed from
    # 'top-1m.csv', so each list is renamed straight after extraction, before
    # the next archive is unpacked. The follow-up steps for a list only run
    # when its download step reported a change.

    # Alexa
    - name: Get the Alexa top 1M list
      register: alexa1Mstate
      unarchive:
        dest: "{{ crawler_dir }}"
        remote_src: true
        src: "https://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
        # Not strictly true: this is the name the file gets from the rename
        # task below, not the name it is extracted under.
        creates: "{{ crawler_dir }}/alexa-top-1m.csv"

    - name: Rename top-1m.csv to alexa-top-1m.csv
      when: alexa1Mstate.changed
      args:
        chdir: "{{ crawler_dir }}"
      command: "mv top-1m.csv alexa-top-1m.csv"

    - name: Create Alexa top 1k file
      when: alexa1Mstate.changed
      args:
        chdir: "{{ crawler_dir }}"
      # 'shell' rather than 'command' because of the output redirection.
      shell: "head -1000 alexa-top-1m.csv > alexa-top-1k.csv"

    # Cisco
    - name: Get the Cisco top 1M list
      register: cisco1Mstate
      unarchive:
        dest: "{{ crawler_dir }}"
        remote_src: true
        src: "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip"
        # Not strictly true: this is the name the file gets from the rename
        # task below, not the name it is extracted under.
        creates: "{{ crawler_dir }}/cisco-top-1m.csv"

    - name: Rename top-1m.csv to cisco-top-1m.csv
      when: cisco1Mstate.changed
      args:
        chdir: "{{ crawler_dir }}"
      command: "mv top-1m.csv cisco-top-1m.csv"

    - name: Create Cisco top 1k file
      when: cisco1Mstate.changed
      args:
        chdir: "{{ crawler_dir }}"
      shell: "head -1000 cisco-top-1m.csv > cisco-top-1k.csv"

    # Majestic
    - name: Get the Majestic top 1M list
      register: majestic1Mstate
      # majestic_million.csv.gz exists, but doesn't appear to play nicely with
      # unarchive, so the uncompressed file is fetched and saved directly
      # under its final name.
      get_url:
        dest: "{{ crawler_dir }}/majestic-top-1m.csv"
        url: "https://downloads.majestic.com/majestic_million.csv"

    - name: Create Majestic top 1k file
      when: majestic1Mstate.changed
      args:
        chdir: "{{ crawler_dir }}"
      shell: "head -1000 majestic-top-1m.csv > majestic-top-1k.csv"
- name: Ensure we have Nightly Firefox
  gather_facts: false
  hosts: all
  tasks:
    # The packaged Firefox is installed for the sake of its many library
    # dependencies (such as libgtk-3-0), which we need even if we never run
    # this version.
    - name: Ensure Firefox is installed
      become: true
      apt:
        state: latest
        name: firefox

    - name: Download Nightly Firefox
      unarchive:
        dest: "{{ crawler_dir }}"
        remote_src: true
        # Discussion about version number in links: https://bugzilla.mozilla.org/show_bug.cgi?id=1595045
        src: "https://download.mozilla.org/?product=firefox-nightly-latest&os=linux64"
        # Nothing is downloaded once a 'firefox' entry exists in the crawler
        # directory.
        creates: "{{ crawler_dir }}/firefox"
# Then to use, set:
# export IFRAME_CRAWLER_FIREFOX_BIN="$PWD/firefox/firefox"