add hints about how to address common errors. new hints can be added as
[bootcd.git] / initscripts / pl_boot
#!/bin/bash
#
# pl_boot: fetch a signed boot script from the boot server over HTTPS,
# verify it with gpg, run it, and repeat. This first part only defines
# configuration and the initial connection state.

# Sourced for its settings.
# NOTE(review): presumably this is where DISCONNECTED_OPERATION (used further
# down in this script) is defined -- confirm against the generator of this file.
. /tmp/planet.cnf

# Run gpg once to create default options
GNUPGHOME=/root
export GNUPGHOME
/usr/bin/gpg --yes 2>/dev/null </dev/null

# if this file is present, cancel the boot (exit this script)
CANCEL_BOOT_FLAG=/tmp/CANCEL_BOOT

# how many consecutive attempts to make against the current server before
# switching to the other one. after this many attempts on the primary we
# switch to the backup; after this many on the backup we go back to the
# primary, and so on.
ATTEMPTS_BEFORE_BACKUP=3

# where all the configuration files for contacting
# the boot server are stored
# (the trailing slash yields paths such as /usr/boot//cacert.pem; this is
# harmless and kept so the derived values stay exactly as before)
BOOT_DIR=/usr/boot/

# get the server we are going to be contacting
BOOT_SERVER=$(cat "$BOOT_DIR/boot_server")
BOOT_SERVER_PORT=$(cat "$BOOT_DIR/boot_server_port")

# the file to request from the boot server
BOOT_SERVER_PATH=$(cat "$BOOT_DIR/boot_server_path")

# location of the cacert for this boot server
BOOT_SERVER_CACERT=$BOOT_DIR/cacert.pem

# location of the gpg key ring to verify scripts
BOOT_SERVER_GPG_KEYRING=$BOOT_DIR/pubring.gpg

# get the backup server we are going to be contacting
BACKUP_BOOT_SERVER=$(cat "$BOOT_DIR/backup/boot_server")
BACKUP_BOOT_SERVER_PORT=$(cat "$BOOT_DIR/backup/boot_server_port")

# the file to request from the backup boot server
BACKUP_BOOT_SERVER_PATH=$(cat "$BOOT_DIR/backup/boot_server_path")

# location of the cacert for the backup boot server
BACKUP_BOOT_SERVER_CACERT=$BOOT_DIR/backup/cacert.pem

# location of the gpg key ring for backup server to verify scripts
BACKUP_BOOT_SERVER_GPG_KEYRING=$BOOT_DIR/backup/pubring.gpg

# location of a file containing this boot cd version
BOOT_VERSION_FILE=/pl_version

# the locations of the downloaded scripts
UNVERIFIED_SCRIPT=/tmp/bootscript.gpg
VERIFIED_SCRIPT=/tmp/bootscript


# --------------------------


# now, contact the boot server, run the script, and do it over again.
# number of fetch attempts made against the currently selected server
contact_count=0

# set to one when we are trying to contact backup server
on_backup_server=0

# start out contacting the primary servers
CONNECT_BOOT_SERVER=$BOOT_SERVER
CONNECT_BOOT_SERVER_PORT=$BOOT_SERVER_PORT
CONNECT_BOOT_SERVER_PATH=$BOOT_SERVER_PATH
CONNECT_BOOT_SERVER_GPG_KEYRING=$BOOT_SERVER_GPG_KEYRING
CONNECT_BOOT_SERVER_CACERT=$BOOT_SERVER_CACERT
# Directory on which the boot media is temporarily mounted when
# DISCONNECTED_OPERATION is set.
BOOT_MEDIA_DIR=/tmp/boot-media

# Print the given text prefixed with the current time (HH:MM:SS).
# Callers pass messages that start with a space, which reproduces the
# historical "HH:MM:SS  pl_boot: ..." layout of this script's output.
pl_log() {
    echo "$(date "+%H:%M:%S")" "$*"
}

# Point the CONNECT_* variables at the primary boot server.
use_primary_server() {
    on_backup_server=0
    CONNECT_BOOT_SERVER=$BOOT_SERVER
    CONNECT_BOOT_SERVER_PORT=$BOOT_SERVER_PORT
    CONNECT_BOOT_SERVER_PATH=$BOOT_SERVER_PATH
    CONNECT_BOOT_SERVER_GPG_KEYRING=$BOOT_SERVER_GPG_KEYRING
    CONNECT_BOOT_SERVER_CACERT=$BOOT_SERVER_CACERT
}

# Point the CONNECT_* variables at the backup boot server.
use_backup_server() {
    on_backup_server=1
    CONNECT_BOOT_SERVER=$BACKUP_BOOT_SERVER
    CONNECT_BOOT_SERVER_PORT=$BACKUP_BOOT_SERVER_PORT
    CONNECT_BOOT_SERVER_PATH=$BACKUP_BOOT_SERVER_PATH
    CONNECT_BOOT_SERVER_GPG_KEYRING=$BACKUP_BOOT_SERVER_GPG_KEYRING
    CONNECT_BOOT_SERVER_CACERT=$BACKUP_BOOT_SERVER_CACERT
}

# Mount the filesystem whose UUID is $DISCONNECTED_OPERATION on
# $BOOT_MEDIA_DIR. Returns non-zero (and removes the mount point again)
# when the mount point cannot be created or the mount fails, so callers
# never copy files into or out of an unmounted directory.
mount_boot_media() {
    mkdir -p "$BOOT_MEDIA_DIR" || return 1
    if ! mount -U "$DISCONNECTED_OPERATION" "$BOOT_MEDIA_DIR"; then
        rmdir "$BOOT_MEDIA_DIR"
        return 1
    fi
}

# Unmount the boot media and remove the temporary mount point.
unmount_boot_media() {
    umount "$BOOT_MEDIA_DIR"
    rmdir "$BOOT_MEDIA_DIR"
}

# Print troubleshooting hints for a failed curl transfer.
# Arguments: $1 - curl exit status
#   6  = curl could not resolve the host name
#   60 = the server certificate could not be verified against the CA cert
# New hints can be added as further case arms.
print_curl_hints() {
    local file
    case "$1" in
        6)
            pl_log " This error likely indicates a networking configuration error. "
            pl_log " Please, check whether you can ping this machine.  If you can, "
            pl_log " we recommend checking your DNS settings.  If you cannot, then "
            pl_log " please double check your network settings registered at PLC and "
            pl_log " stored on this Boot Image."
            # dump the interface configuration stored on the boot image;
            # glob directly instead of parsing ls output
            for file in /etc/sysconfig/network-scripts/ifcfg-eth*; do
                # an unmatched glob stays literal; skip it
                [[ -e $file ]] || continue
                pl_log "$file :"
                cat "$file"
            done
            ;;
        60)
            pl_log " This error likely indicates that the hardware clock is not set "
            pl_log " to GMT.  The result is that authentication between the local and "
            pl_log " remote site fails.  Please double check this machine's system "
            pl_log " clock, and set it to GMT in the BIOS. If after rebooting the same "
            pl_log " error occurs, please report the situation to support@planet-lab.org "
            pl_log " with as much detail as possible."
            ;;
        *)
            pl_log " The best-practice approach for handling this error is not yet "
            pl_log " documented.  Please report this error to support@planet-lab.org "
            pl_log " with as much detail as possible."
            ;;
    esac
}

# Main loop: fetch the boot script, verify it, run it, and start over.
# The only way out is the cancel flag file, which makes the script exit 0.
while : ; do

    if [[ -f $CANCEL_BOOT_FLAG ]]; then
        pl_log " pl_boot: got request to cancel boot, exiting"
        exit 0
    fi

    # after too many attempts on one server, switch to the other one
    if [[ $contact_count -ge $ATTEMPTS_BEFORE_BACKUP ]]; then

        contact_count=0

        if [[ $on_backup_server == 1 ]]; then
            pl_log " pl_boot: failed to contact backup server, trying primary $BOOT_SERVER"
            use_primary_server
        else
            pl_log " pl_boot: failed to contact primary server, trying backup $BACKUP_BOOT_SERVER"
            use_backup_server
        fi
    fi

    # pause before every attempt except the first one against a server
    if [[ $contact_count != 0 ]]; then

        if [[ $on_backup_server == 1 ]]; then
            pl_log " pl_boot: attempting to fetch script from backup server in 30s"
        else
            pl_log " pl_boot: attempting to fetch script from primary server in 30s"
        fi
        /bin/sleep 30
    fi

    # assemble the curl transaction; an array keeps every argument intact
    # without relying on word-splitting. "name=<file" makes curl send the
    # file's contents as the value of that form field.
    # NOTE(review): --sslv3 is kept as-is; SSLv3 is deprecated (RFC 7568) --
    # confirm what the boot servers accept before changing it.
    curl_cmd=(
        /usr/bin/curl
        --connect-timeout 60
        --max-time 600
        --form "version=<$BOOT_VERSION_FILE"
        --form "cmdline=</proc/cmdline"
        --form "uptime=</proc/uptime"
        --form "ifconfig=</tmp/ifconfig"
        --form "nonce=</tmp/nonce"
        --location
        --output "$UNVERIFIED_SCRIPT"
        --sslv3
        --silent
        --show-error
        --fail
        --stderr /tmp/curl_errors
        --cacert "$CONNECT_BOOT_SERVER_CACERT"
        "https://$CONNECT_BOOT_SERVER:$CONNECT_BOOT_SERVER_PORT/$CONNECT_BOOT_SERVER_PATH"
    )

    # assemble the gpg command line
    gpg_cmd=(
        /usr/bin/gpg
        --no-default-keyring
        --keyring "$CONNECT_BOOT_SERVER_GPG_KEYRING"
        --output "$VERIFIED_SCRIPT"
        --always-trust
        --decrypt "$UNVERIFIED_SCRIPT"
    )

    # 32 random bytes written as one line of 64 hex digits; curl reads
    # /tmp/nonce when it runs below
    pl_log " pl_boot: generating new nonce"
    /usr/bin/head --bytes=32 /dev/urandom \
        | /usr/bin/od -tx1 -An --width=32 \
        | /bin/sed 's/ //g' > /tmp/nonce

    pl_log " pl_boot: fetching script from boot server $CONNECT_BOOT_SERVER"
    ((contact_count++))
    rm -f "$UNVERIFIED_SCRIPT"
    "${curl_cmd[@]}"
    curl_err=$?
    if (( curl_err != 0 )); then
        pl_log " pl_boot: curl request failed with error $curl_err:"
        cat /tmp/curl_errors
        echo
        if [[ -n $DISCONNECTED_OPERATION ]]; then
            # download failed: fall back to the copy cached on the boot
            # media. if nothing can be restored the verification step
            # below fails and the loop retries.
            if mount_boot_media; then
                cp "$BOOT_MEDIA_DIR/bootscript.gpg" "$UNVERIFIED_SCRIPT"
                unmount_boot_media
            fi
        else
            print_curl_hints "$curl_err"
            continue
        fi
    elif [[ -n $DISCONNECTED_OPERATION ]]; then
        # download succeeded: cache the still-unverified script on the
        # boot media for later disconnected boots
        if mount_boot_media; then
            cp "$UNVERIFIED_SCRIPT" "$BOOT_MEDIA_DIR"
            unmount_boot_media
        fi
    fi

    pl_log " pl_boot: verifying downloaded script"
    rm -f "$VERIFIED_SCRIPT"
    if ! "${gpg_cmd[@]}" 2> /tmp/gpg_errors; then
        pl_log " pl_boot: failed to verify file:"
        cat /tmp/gpg_errors
        echo
        continue
    fi
    pl_log " pl_boot: decrypted and verified script succesfully"

    pl_log " pl_boot: handing control to download script"
    rm -f "$UNVERIFIED_SCRIPT"
    chmod +x "$VERIFIED_SCRIPT"
    "$VERIFIED_SCRIPT"

    pl_log " pl_boot: downloaded script has returned"
done

# NOTE(review): not reachable as written -- the loop above has no break and
# the cancel path exits the script directly.
pl_log " pl_boot: automatic boot process canceled by user"