-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathfetch-comments.sh
More file actions
executable file
·229 lines (186 loc) · 7.45 KB
/
fetch-comments.sh
File metadata and controls
executable file
·229 lines (186 loc) · 7.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/bin/bash
# fetch-comments.sh
#
# Reads GitHub API URLs from stdin (one per line), fetches every page of
# each paginated result (following the Link: rel="next" header), and
# writes one record per page to stdout. Diagnostics and progress go to
# stderr. Requires GITHUB_TOKEN in the environment for authentication.
#
# Tunables (environment variables; empty and unset are equivalent):
#   SILENT          "true" to suppress progress output          (default: false)
#   PREFIX_URL      "true" to prefix each record with its URL   (default: false)
#   MAX_PAGES       stop after this many pages per input URL    (default: unlimited)
#   HEADER_ACCEPT   value sent in the Accept: request header    (default: empty)
#   ZERO_TERMINATE  "true" to NUL-terminate records             (default: false)
#   BASE64          "true" to base64-encode each page body      (default: false)
#   BEST_EFFORT     "true" to skip failing URLs instead of dying (default: false)
#   RETRY_MAX_COUNT max retries on server errors                (default: 8, set below)
#   MAX_TIME        per-request curl timeout in seconds         (default: none)
#
#TODO: these should really be passed as options
SILENT="${SILENT:-false}"
PREFIX_URL="${PREFIX_URL:-false}"
MAX_PAGES="${MAX_PAGES:-}"
HEADER_ACCEPT="${HEADER_ACCEPT:-}"
ZERO_TERMINATE="${ZERO_TERMINATE:-false}"
BASE64="${BASE64:-false}"
BEST_EFFORT="${BEST_EFFORT:-false}"
RETRY_MAX_COUNT="${RETRY_MAX_COUNT:-}"
# MAX_TIME is rewritten into a curl option string ("--max-time N"); it is
# expanded unquoted on the curl command line below, which relies on
# word-splitting to produce two arguments.
if [ -n "${MAX_TIME:-}" ]; then
MAX_TIME="--max-time ${MAX_TIME}"
else
MAX_TIME=""
fi
# Fail fast from here on: exit on errors and unset variables, and fail a
# pipeline if any stage fails. (Enabled after the defaulting above so the
# optional env vars may legitimately be unset on entry.)
set -eu -o pipefail
# cross-OS compatibility (greadlink, gsed, gzcat are GNU implementations for OS X)
# On macOS the BSD userland behaves differently from GNU coreutils, so the
# g-prefixed tools (installed via homebrew) are aliased over the plain
# names for the rest of this script. expand_aliases is required because
# non-interactive shells do not expand aliases by default.
[[ $(uname) == 'Darwin' ]] && {
shopt -s expand_aliases
which greadlink gsed gzcat gjoin gmktemp gdate gwc > /dev/null && {
# Drop any pre-existing aliases before redefining; output is suppressed
# and a failure here does not abort the group (alias runs last).
unalias readlink sed zcat join mktemp date wc >/dev/null 2>/dev/null
alias readlink=greadlink sed=gsed zcat=gzcat join=gjoin mktemp=gmktemp date=gdate wc=gwc
} || {
echo 'ERROR: GNU utils required for Mac. You may use homebrew to install them: brew install coreutils gnu-sed'
exit 1
}
}
# Report where a failure happened so partial fetch state can be recovered
# if the script dies mid-run.
trap 'echo "[ERROR] Error occurred at $BASH_SOURCE:$LINENO command: $BASH_COMMAND exit: $?" > /dev/stderr' ERR
# The GITHUB_TOKEN environment variable is assumed to be set.
# Results are written to standard out. To write to a file instead,
# uncomment the lines below (plus the matching echoes further down):
#if [ -z "$ALLCOMMENTS" ]; then
# ALLCOMMENTS=$(mktemp --tmpdir allcomments.XXXXXXXXXX)
#fi
#echo "writing comments to ${ALLCOMMENTS}" > /dev/stderr
# Alternatively, the start URL could be taken from the first argument:
#if [ -z "$1" ]; then
# OWNER="rails"
# REPO="rails"
# NEXTURL="https://api.github.com/repos/${OWNER}/${REPO}/pulls/comments?since=1970-01-01T00:00:00Z&per_page=100"
#else
# NEXTURL="$1"
#fi
# Exponential-backoff parameters: initial sleep (seconds) and the maximum
# number of retries before giving up on a URL.
RETRY_TIME=1
RETRY_MAX_COUNT="${RETRY_MAX_COUNT:-8}"
#a fifo for tracking progress for each input url
# PV_PIDFILE will hold the pid of the background pv process so its size
# estimate can be adjusted later via `pv -R`. `mktemp -u` only reserves a
# name — the actual files are created by pv and mkfifo respectively.
PV_PIDFILE="$(mktemp -u --tmpdir autodev_fetch_pvpid.XXXXXXXXXX)"
FETCH_FIFO="$(mktemp -u --tmpdir autodev_fetch_fifo.XXXXXXXXXX)"
# The fifo (and hence the progress meter) exists only in non-silent mode.
$SILENT || mkfifo "$FETCH_FIFO"
# ENCODER is the per-page output filter. It is intentionally a plain
# string (not quoted at the call site) so "base64 -w0" word-splits into a
# command plus argument; base64 keeps arbitrary payloads on one line,
# cat passes bytes through unchanged.
if $BASE64; then
ENCODER="base64 -w0"
else
ENCODER=cat
fi
# Main driver: read one API URL per line from stdin and fetch every page
# of its paginated result.
while read NEXTURL; do
$SILENT || echo "fetching ${NEXTURL}" > /dev/stderr
# Scratch files for the response headers and body of each request.
#HEADERS=/tmp/headers.last
HEADERS=$(mktemp --tmpdir headers.XXXXXXXXXX)
#COMMENTS=/tmp/comments.last
COMMENTS=$(mktemp --tmpdir comments.XXXXXXXXXX)
TOTALPAGES=""
#set up a progress meter for this input url
# pv counts lines (-l) arriving on the fifo; fd 3 holds the fifo open for
# the duration of this URL so pv does not see EOF between pages.
$SILENT || pv -P $PV_PIDFILE -l $FETCH_FIFO > /dev/null &
$SILENT || exec 3>$FETCH_FIFO
pages=0
retry_count=0
retry_sleep=$RETRY_TIME
# Follow the pagination chain until there is no rel="next" link (or a
# branch below clears NEXTURL to bail out on this URL).
while [ ! -z "${NEXTURL}" ]; do
if [ ! -z "${MAX_PAGES}" ]; then
if [[ "${pages}" -ge "${MAX_PAGES}" ]]; then
break;
fi
fi
# Fetch one page. MAX_TIME is intentionally unquoted (may expand to
# "--max-time N" — two words — or nothing). Any 5xx status line in the
# dumped headers is treated as a retryable server error.
if (! curl ${MAX_TIME} --http1.1 -L --compressed -s -D ${HEADERS} -H "Authorization: token ${GITHUB_TOKEN}" -H "Accept: ${HEADER_ACCEPT}" "${NEXTURL}" > ${COMMENTS} ) || grep -E --silent '^HTTP/[^ ]+ +5[0-9][0-9]' ${HEADERS}; then
# NOTE(review): PIPESTATUS[0] reflects the last pipeline the shell ran
# in the condition above — the grep when curl succeeded, not
# necessarily curl's own exit code. Confirm this reports what was
# intended.
echo "error ${PIPESTATUS[0]}" > /dev/stderr
#handle server errors with retry
#we do this manually to avoid polluting the output with server
#error output
if [ "$retry_count" -ge "$RETRY_MAX_COUNT" ]; then
echo "exceeded max retry count ${retry_count} on ${NEXTURL}" > /dev/stderr
if $BEST_EFFORT; then
# Best-effort mode: abandon this URL (clearing NEXTURL ends the
# inner loop) and move on to the next input line.
echo "skipping ${NEXTURL}" > /dev/stderr
NEXTURL=""
continue
else
exit 1;
fi
fi
# Exponential backoff, then re-try the same NEXTURL.
retry_count=$(( $retry_count + 1 ))
retry_sleep=$(( $retry_sleep * 2 ))
sleep ${retry_sleep}
elif grep --silent 'HTTP/1.1 403' ${HEADERS}; then
# 403 means either rate limiting (sleep until the advertised reset)
# or, on diff endpoints, a "too big" error we may optionally skip.
retry_count=0
retry_sleep=$RETRY_TIME
if grep -q '^[Xx]-[Rr]ate[Ll]imit-[Rr]eset: [0-9]*' ${HEADERS}; then
#handle rate limiting
echo "rate limit" > /dev/stderr
# Extract the epoch seconds at which the rate limit window resets.
reset_time=$(grep '^[Xx]-[Rr]ate[Ll]imit-[Rr]eset: [0-9]*' ${HEADERS} | sed 's/^[Xx]-[Rr]ate[Ll]imit-[Rr]eset: \([0-9]*\).*/\1/')
grep '[Xx]-[Rr]ate[Ll]imit-' ${HEADERS} > /dev/stderr
# Sleep until the reset time plus a 10 second safety margin.
sleeptime=$(( $(( ${reset_time} - $(date +%s) )) + 10 ))
echo "sleeping $sleeptime" > /dev/stderr
sleep ${sleeptime}
elif ${BEST_EFFORT} && (cat ${COMMENTS} | head -c256 | grep -q "error: too big or took too long to generate"); then
#sometimes on diffs we get a 403 with "error: too big"
echo "error too big, skipping ${NEXTURL}" > /dev/stderr
NEXTURL=""
continue
else
# Unrecognized 403; scratch files are left in place for inspection.
echo "unknown error, check ${HEADERS} and ${COMMENTS}" > /dev/stderr
exit 1
fi
#TODO: I don't think we should support 404 at all. no results should just be an empty array, not a 404
# 404 can come up with the diff endpoints when then diff is unavailable for some reason
#check if there simply are no results
elif grep --silent 'HTTP/1.1 404' ${HEADERS}; then
echo "no results for ${NEXTURL}" > /dev/stderr
if [ -z "${TOTALPAGES}" ]; then
# 404 on the very first page: treated as "no results", clean exit.
exit
else
# 404 mid-pagination is unexpected — we already saw a page count.
echo "this is unexpected as we should have ${TOTALPAGES} pages" > /dev/stderr
exit 1
fi
else
# Success path: reset the backoff state for subsequent pages.
retry_count=0
retry_sleep=$RETRY_TIME
#check if there was some other error
if ! grep --silent 'HTTP/1.1 200' ${HEADERS}; then
echo "got bad status code, see ${HEADERS} for details" > /dev/stderr
exit 1
fi
#optionally prefix with the url we just fetched
! ${PREFIX_URL} || printf "%s\t" ${NEXTURL}
# Emit the page body. Unless the record is NUL-terminated or base64
# encoded, newlines are stripped so each page is one output line.
if ${ZERO_TERMINATE} || ${BASE64}; then
cat ${COMMENTS} | $ENCODER #>> ${ALLCOMMENTS}
else
cat ${COMMENTS} | tr -d '\n' | $ENCODER #>> ${ALLCOMMENTS}
fi
if ${ZERO_TERMINATE}; then
echo -ne '\0'
else
echo "" #>> ${ALLCOMMENTS}
fi
# Advance to the next page via the Link response header, if any.
NEXTURL=$(cat ${HEADERS} | grep '^[Ll]ink: ' | tr ',' '\n' | grep 'rel="next"' | sed 's/.*<\([^>]*\).*/\1/' || true)
if [ -z "${TOTALPAGES}" ]; then
# First page: derive the total page count from the rel="last" link
# (defaulting to 1) so the progress meter can show a percentage.
TOTALPAGES=$(cat ${HEADERS} | grep '^[Ll]ink: ' | tr ',' '\n' | grep 'rel="last"' | sed 's/.*<\([^>]*\).*/\1/' | sed 's/.*page=\([0-9]*\).*/\1/' | sed 's/.*page=\([0-9]*\).*/\1/' || echo "1")
EXPECTED_PAGES=${TOTALPAGES}
if [ ! -z "${MAX_PAGES}" ]; then
# Cap the expectation when MAX_PAGES limits how far we will go.
EXPECTED_PAGES=$(( ${MAX_PAGES} > ${TOTALPAGES} ? ${TOTALPAGES} : ${MAX_PAGES} ))
fi
$SILENT || echo "${TOTALPAGES} pages" > /dev/stderr
#allow this pv to fail without killing the script
#this might happen on Cygwin or WSL
$SILENT || pv -R $(cat $PV_PIDFILE) -s ${EXPECTED_PAGES} || true
fi
# Tick the progress meter: one line written per fetched page.
$SILENT || echo '.' > ${FETCH_FIFO}
fi
pages=$(( $pages + 1 ))
done
#close/cleanup the progress meter
$SILENT || exec 3>&-
#clean up state for this url
#these will be left in place if something went wrong
rm $HEADERS
rm $COMMENTS
$SILENT || echo "done" > /dev/stderr
done
# The fifo only exists when progress metering was enabled.
$SILENT || rm $FETCH_FIFO