# date 2019-03
from pyquery import PyQuery as pq
import requests
import lxml
import json
import pandas as pd
import re
from hyperlink import URL

# Departments whose plan PDF location is hard-coded instead of being scraped
# from the department page.
_KNOWN_PLAN_PDFS = {
    '漁農自然護理署': 'https://www.afcd.gov.hk/tc_chi/eservices/eservice_it/files/Annual_Open_Data_Plan_AFCD_2018_tc.pdf',
    '香港天文台': 'https://www.hko.gov.hk/abouthko/annual_open_data_plan/doc/AnnualOpenDataPlanFor2018_TraditionalChinese.pdf',
    '民政事務局': 'https://www.hab.gov.hk/file_manager/tc/documents/about_hab/annual_open_data_plan_2018.pdf',
    '發展局': 'https://www.devb.gov.hk/filemanager/TC/content_1120/Annual_open_data_plan_2018_of_DEVB.pdf',
    '在職家庭津貼辦事處': 'https://www.wfsfaa.gov.hk/wfao/pdf/Annual_Open_Data_Plan_for_2018_TC.pdf',
}

# Keywords (case-insensitive) a PDF link's href must contain to be treated as
# a plan document.  Compiled once instead of on every loop iteration.
_PLAN_LINK_RE = re.compile('annual|open|aodp|plan', re.IGNORECASE)


def _base_for_link(page_url, link):
    """Return the base URL text that *link* is resolved against.

    - absolute ``http...`` links need no base, so ``''`` is returned;
    - root-relative links (``/...``) use ``scheme://host`` of *page_url*;
    - anything else uses the "directory" of *page_url* (everything up to and
      including the last ``/``).
    """
    if link.startswith('http'):
        return ''
    parts = page_url.split('/')
    if link.startswith('/'):
        return '/'.join(parts[:3])
    if len(parts) <= 3:
        # The page URL has no path at all (e.g. "https://host"); dropping the
        # last segment would drop the host itself and leave "https://".
        return page_url + '/'
    return '/'.join(parts[:-1]) + '/'


def loadPage(url2, deptName):
    """Locate the plan PDF for a department.

    Departments listed in ``_KNOWN_PLAN_PDFS`` are answered from that table
    without touching the network.  For any other department the page at
    *url2* is loaded with PyQuery, its ``<a>`` elements whose ``href``
    contains ``.pdf`` are scanned, and every href matching
    ``annual|open|aodp|plan`` (case-insensitive) is resolved to an absolute
    URL.  Each resolved link is printed; when several links match, the last
    one is the one returned.

    Args:
        url2: URL of the department page to scan.
        deptName: Department name; also the lookup key for the hard-coded
            table.

    Returns:
        A ``(deptName, url2, pdfUrl)`` tuple.  ``pdfUrl`` is ``""`` when no
        matching link was found.
    """
    known_pdf = _KNOWN_PLAN_PDFS.get(deptName)
    if known_pdf is not None:
        # Checked before fetching the page: the result of the fetch would be
        # discarded for these departments anyway.
        print("{} {} {}".format(deptName, url2, known_pdf))
        return deptName, url2, known_pdf

    doc = pq(url2, headers={'user-agent': 'pyquery'}, encoding='utf-8')

    pdfUrl = ""
    # NOTE(review): the original source was whitespace-collapsed; the loop is
    # assumed to have no early exit (no `break` is present), so a later
    # matching link overrides an earlier one - confirm this is intended.
    for item in doc('a[href*=".pdf"]').items():
        href = item.attr('href')
        if _PLAN_LINK_RE.search(href) is None:
            continue

        # Some pages use Windows-style separators in their hrefs.
        pdfLink = href.replace('\\', '/')
        baseurl = _base_for_link(url2, pdfLink)
        if pdfLink.startswith('./'):
            pdfLink = pdfLink[2:]

        pdfUrl = URL.from_text(baseurl).click(pdfLink).to_text()
        print("{} {} {}".format(deptName, baseurl, pdfUrl))

    return deptName, url2, pdfUrl