From 5ae8a8bcfd563ef0d472d0560beba53798897e3c Mon Sep 17 00:00:00 2001 From: Ben Parsons Date: Tue, 7 Apr 2020 14:37:55 +0100 Subject: [PATCH 1/2] fix pagination in scraper --- scripts/proposals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/proposals.py b/scripts/proposals.py index 27cc6cfb..96904ce4 100755 --- a/scripts/proposals.py +++ b/scripts/proposals.py @@ -38,7 +38,7 @@ def getpage(url): pagecount = 1 for link in resp.links.values(): if link['rel'] == 'last': - pagecount = int(re.search('page=(.+?)', link['url']).group(1)) + pagecount = int(re.search('page=(.+)', link['url']).group(1)) val = resp.json() if not isinstance(val, list): From 7c037d2490203d156e9def41f43ce255d863d48b Mon Sep 17 00:00:00 2001 From: Ben Parsons Date: Tue, 7 Apr 2020 15:26:48 +0100 Subject: [PATCH 2/2] improve capture and add example --- scripts/proposals.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/proposals.py b/scripts/proposals.py index 96904ce4..faa10a83 100755 --- a/scripts/proposals.py +++ b/scripts/proposals.py @@ -38,7 +38,10 @@ def getpage(url): pagecount = 1 for link in resp.links.values(): if link['rel'] == 'last': - pagecount = int(re.search('page=(.+)', link['url']).group(1)) + # we extract the pagecount from the `page` param of the last url + # in the response, eg + # 'https://api.github.com/repositories/24998719/issues?state=all&labels=proposal&page=10' + pagecount = int(re.search('page=(\d+)', link['url']).group(1)) val = resp.json() if not isinstance(val, list):