Python小实例(节假日推送)
效果图
整体思路
- 从国务院办公厅(国办)网站爬取所有节假日通知的 URL 到本地
chinese_holiday_urls.json
{ "20191121": [ "国务院办公厅关于2020年部分节假日安排的通知", "http://www.gov.cn/zhengce/content/2019-11/21/content_5454164.htm" ], ...}
读取选择本年度的发布URL
读取标题包含本年(2020)的URL
每年可能发布多次文件,因此需要爬取所有关于本年的发布 URL
正则取数据[节假日,多少天,调休日]
chinese_holiday.json
{ "holidays": { "20200101": [ "元旦", "星期三", "1" ], ... }, "workdays": { "20200119": [ "调休日", "星期天" ], ... } }
- 节假日数据整合
把周末加入到以上的 `holidays` 字段
然后将 `holidays` 字段的所有数据按 key 重新排序
剔除掉加入的周末数据里的调休日
将最后的数据重新保存到 Json 文件
chinese_holiday.json
{ "holidays": { "20200101": [ "元旦", "星期三", "1" ], "20200104": [ "星期六" ], ... }, "workdays": { "20200119": [ "调休日", "星期天" ], ... } }
- 推送
使用 apscheduler 每天一次从 chinese_holiday.json 读取数据推送
使用 apscheduler 每月一次从网站爬取更新数据到两个 Json 文件
使用 wxpusher 推送消息
代码部分
导入库
import osimport reimport sysimport jsonimport requestsimport calendarfrom collections import OrderedDictfrom bs4 import BeautifulSoupfrom datetime import datetime, timedeltafrom wxpusher import WxPusherfrom apscheduler.schedulers.blocking import BlockingScheduler
从国务院办公厅(国办)网站爬取所有 URL 到本地
知识点01: requests|BeautifulSoup 基础网页处理
知识点02: Json 文件的保存与读取
- ensure_ascii=False 设置为 False, 会禁止中文转码为 Unicode 码
- indent=4 设置 Json 每行缩进为 4 格
- sort_keys=True 设置保存 json 文件时,按key 排序保存
###############################################################################
# Save the URLs of all published holiday notices
def SaveUrl(url):
    """Fetch the notice list at *url* and record new notices in JSON_FILE.

    Every ``<td class="info">`` element of the response is read as one
    notice: its ``<a>`` supplies the title and link, and the text of its
    last ``<li>`` supplies the publication date.  Notices are stored as
    ``{"YYYYMMDD": [title, href]}``.  Scanning stops at the first
    publication date that is already stored.

    Args:
        url: Address of the search/list page to query (sent together with
            the module-level PARAMS and HEADERS).

    Returns:
        The path of the JSON index file (``JSON_FILE``).
    """
    # A timeout keeps a stalled connection from blocking the caller forever.
    respose_html = requests.get(url, params=PARAMS, headers=HEADERS,
                                timeout=30)
    soup_html = BeautifulSoup(respose_html.content, "html.parser")
    info_elems = soup_html.find_all("td", attrs={"class": "info"})

    # Load the existing index once instead of re-reading it for every entry.
    if os.path.exists(JSON_FILE) and os.path.getsize(JSON_FILE):
        with open(JSON_FILE, 'r', encoding='utf-8') as ff:
            json_obj = json.load(ff)
    else:
        json_obj = {}

    changed = False
    for info_elem in info_elems:
        li_elems = info_elem.find_all("li")
        link = info_elem.find('a')
        # Skip malformed entries instead of raising IndexError/TypeError.
        if not li_elems or link is None or not link.get('href'):
            continue

        # Keep only the digits of the text after the last colon (the date).
        push_date_str = li_elems[-1].text
        push_date_str = push_date_str.split(':')[-1].strip(' ')
        push_date_str = ''.join(re.findall('[0-9]+', push_date_str))

        if push_date_str in json_obj:
            # Already recorded: stop scanning the remaining entries.
            break
        json_obj[push_date_str] = [link.text, link['href']]
        changed = True

    # Write the index back a single time, and only when something was added.
    if changed:
        with open(JSON_FILE, 'w', encoding='utf-8') as ff:
            json.dump(json_obj, ff, indent=4, sort_keys=True,
                      ensure_ascii=False)
    return JSON_FILE
读取并爬取需要的URL
知识点:正则,正则,还是正则!
官方文档:https://docs.python.org/zh-cn/3/library/re.html
###############################################################################
# Crawl the holiday notice page published for the current year
# HOLIDAY_DIC maps "YYYYMMDD" -> [holiday name(s)..., weekday, days off]
# WORKDAY_DIC maps "YYYYMMDD" -> ["调休日", weekday]
def CrawPage(href):
    """Parse one holiday notice page into HOLIDAY_DIC and WORKDAY_DIC.

    The text of every ``<p>`` element is matched against the notice-line
    pattern.  For a matching line, each day of the holiday (start day plus
    the announced number of days) is added to HOLIDAY_DIC, and each listed
    make-up working day is added to WORKDAY_DIC.  All dates are placed in
    THIS_YEAR.

    Args:
        href: URL of the notice page to download.
    """
    # A timeout keeps a stalled connection from blocking the caller forever.
    respose_html = requests.get(href, params=PARAMS, headers=HEADERS,
                                timeout=30)
    soup_html = BeautifulSoup(respose_html.content, "html.parser")

    # Compiled once, outside the paragraph loop.  Raw strings avoid invalid
    # escape warnings; ``\d+天`` also accepts counts containing a zero
    # (the former ``[1-9]+`` could not match "10天"); both ASCII and
    # full-width punctuation are accepted.
    # The leading ``.{2}`` skips two characters, presumably the item
    # numbering of the notice -- TODO confirm against a live page.
    regx = re.compile(
        r'^.{2}(?P<holiday>.*)[::](\d{4}年)?(?P<startday>\d+月\d+日)至?'
        r'(\d+月)?(?P<endday>\d+日)?放假(调休)?[,,]共(?P<offdays>\d+天)。'
        r'(?P<ondays>(\d+月\d+日[((]?星期.[))]?、?)+上班。)?$'
    )
    workday_regx = re.compile(r'\d+月\d+日')

    for info in soup_html.find_all("p"):
        re_obj = regx.search(info.text)
        if not re_obj:
            continue

        # One line may name several holidays separated by "、".
        holiday_names = re.split('、', re_obj.group('holiday'))

        # First day off, normalised to "YYYYMMDD".
        startday = re_obj.group('startday')
        startday = str(THIS_YEAR) + ''.join(format_date(startday))
        start_dt = datetime.strptime(startday, "%Y%m%d")

        # Announced total number of days off.
        offdays = int(re.sub(r'\D', '', re_obj.group('offdays')))

        for offday in gen_dates(start_dt, offdays):
            HOLIDAY_DIC[offday] = holiday_names + [
                WEEKDAYS[get_week(offday)], str(offdays)]

        # Make-up working days, if the line lists any.
        ondays = re_obj.group('ondays')
        if ondays is not None:
            for onday in workday_regx.findall(ondays):
                onday = str(THIS_YEAR) + ''.join(format_date(onday))
                WORKDAY_DIC[onday] = ["调休日", WEEKDAYS[get_week(onday)]]
节假日数据整合
知识点: calendar库的使用
###############################################################################
# Data processing
# WEEKEND_DIC maps "YYYYMMDD" -> "星期六" or "星期天"
def All_WEEEK():
    """Fill WEEKEND_DIC with the weekend days from THIS_MONTH to December.

    Returns:
        The module-level WEEKEND_DIC.
    """
    for month_no in range(THIS_MONTH, 13):
        for week_row in calendar.monthcalendar(THIS_YEAR, month_no):
            # The last two columns of a calendar row are read as
            # Saturday and Sunday; 0 marks a day outside this month.
            for day_no, label in zip(week_row[-2:], ("星期六", "星期天")):
                if day_no == 0:
                    continue
                month_day = ''.join(
                    format_date('{}-{}'.format(month_no, day_no)))
                WEEKEND_DIC[str(THIS_YEAR) + month_day] = label
    return WEEKEND_DIC


def HolidayMain():
    """Merge weekends into the holiday table and save it to BASE_FILE.

    Weekend days that are not already holidays are added, the table is
    re-ordered by date key, make-up working days are removed from it, and
    the result is dumped as JSON under the keys 'holidays' and 'workdays'.
    """
    global HOLIDAY_DIC, WORKDAY_DIC

    # Add weekends without overwriting crawled holiday entries.
    for day_key, weekday_name in All_WEEEK().items():
        HOLIDAY_DIC.setdefault(day_key, [weekday_name])

    # Rebuild the table ordered by date key.
    unsorted_holidays = HOLIDAY_DIC
    HOLIDAY_DIC = OrderedDict(
        (day_key, unsorted_holidays[day_key])
        for day_key in sorted(unsorted_holidays)
    )

    # Make-up working days are not days off.
    for work_key in WORKDAY_DIC:
        HOLIDAY_DIC.pop(work_key, None)

    MULTI_DIC['holidays'] = HOLIDAY_DIC
    MULTI_DIC['workdays'] = WORKDAY_DIC

    # Save to JSON.
    with open(BASE_FILE, 'w', encoding='utf-8') as out_file:
        json.dump(MULTI_DIC, out_file, indent=4, sort_keys=True,
                  ensure_ascii=False)
微信推送
需要去 wxpusher 创建应用
使用 markdown 格式发送
主要需要从 json 文件中计算出需要的数据
# Markdown template of the pushed message.
MESSAGE = """### 节假日推送
> 今天:
> + {today_key}
> + {today_value}
>
> 明天:
> + {tomorrow_key}
> + {tomorrow_value}
>
> 下一个节假日:
> + {next_key}
> + {next_value}
>
> 今年剩余节假日:
> + {last_holidays} 天"""


def MsgPusher():
    """Build the daily holiday summary and push it to all subscribers.

    Reads the holiday table from BASE_FILE, describes today and tomorrow,
    finds the next official holiday that differs from today's entry,
    counts the official holiday days remaining in the table, and sends the
    formatted MESSAGE through WxPusher.
    """
    # Query the subscribed users to obtain their UIDs.
    query_user_ret = WxPusher.query_user(1, 100, APP_TOKEN)
    users_info = query_user_ret['data']['records']
    # Replace the shared list in place: appending on every run would make
    # the UID list grow with duplicates each time the job fires.
    UID_LIST[:] = [each_user['uid'] for each_user in users_info]

    with open(BASE_FILE, 'r', encoding='utf-8') as ff:
        multi_json = json.load(ff)
    holidays = multi_json['holidays']

    # Take the date at call time; a module-level timestamp would be frozen
    # at start-up when this function runs from a long-lived scheduler.
    now = datetime.now()
    today_key = now.strftime("%Y%m%d")
    today_value = PushInfo(today_key, multi_json)
    tomorrow_key = (now + timedelta(days=1)).strftime("%Y%m%d")
    tomorrow_value = PushInfo(tomorrow_key, multi_json)

    # Find the next holiday.  Defaults cover the "nothing found" case,
    # including an empty holiday table.
    next_key = None
    next_value = None
    today_name = holidays[today_key][0] if today_key in holidays else None
    today_dt = datetime.strptime(today_key, "%Y%m%d")
    for key in sorted(holidays):
        value = holidays[key]
        # Entries with more than two fields are treated as official
        # holidays; shorter entries (e.g. plain weekends) are skipped.
        if key <= today_key or len(value) <= 2:
            continue
        # Skip the holiday that today already belongs to.
        if today_name is not None and value[0] == today_name:
            continue
        next_key = key
        # Number of days until that holiday starts.
        diff_days = (datetime.strptime(next_key, "%Y%m%d") - today_dt).days
        next_value = "|".join(value[:-1]) + \
            ", 再过 {} 天就可以玩 {} 天了哦!".format(diff_days, value[-1])
        break

    # Count the official holiday days that are still ahead.
    last_holidays = sum(
        1 for key, value in holidays.items()
        if key > today_key and len(value) > 2
    )

    message = MESSAGE.format(
        today_key=today_key,
        today_value=today_value,
        tomorrow_key=tomorrow_key,
        tomorrow_value=tomorrow_value,
        next_key=next_key,
        next_value=next_value,
        last_holidays=str(last_holidays))

    print(">>> 开始推送消息")
    result = WxPusher.send_message(message, UID_LIST, APP_TOKEN,
                                   content_type=3)
    print(result)
    print("<<< 推送消息完成")
完整代码
"""Crawl the published holiday arrangements and push a daily reminder.

Run with ``--update`` to refresh the two JSON files once, or with
``--cron`` (or no argument) to start the blocking scheduler that pushes a
message every day and refreshes the data once a month.
"""
import os
import re
import sys
import json
import requests
import calendar
from collections import OrderedDict
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from wxpusher import WxPusher
from apscheduler.schedulers.blocking import BlockingScheduler

SHELLNAME = os.path.basename(__file__)
BASE_NAME = SHELLNAME.split('.')[0]
# Index of published notices: {"YYYYMMDD": [title, url]}
JSON_FILE = BASE_NAME + '_urls.json'
# Holiday table: {"holidays": {...}, "workdays": {...}}
BASE_FILE = BASE_NAME + '.json'

# Date snapshot; refreshed by update_main() before every crawl.
DATE_NOW = datetime.now()
THIS_YEAR = DATE_NOW.year
THIS_MONTH = DATE_NOW.month
THIS_DAY = DATE_NOW.day

HOLIDAY_DIC = OrderedDict()
WORKDAY_DIC = OrderedDict()
WEEKEND_DIC = OrderedDict()
MULTI_DIC = OrderedDict()
# Indexed by datetime.weekday(): 0 is Monday.
WEEKDAYS = ("星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期天")

# Apply for your own token at wxpusher.
APP_TOKEN = ''
UID_LIST = []

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/79.0.3945.79 Safari/537.36',
    "Accept-Language": "zh-CN,zh;q=0.9"
}

PARAMS = {
    "q": "",
    "n": "100",
    "t": "paper",
    "childtype": "",
    "subchildtype": "gc189",
    "pcodeJiguan": "国办发明电",
    "pcodeYear": "",
    "pcodeNum": "",
    "location": "综合政务其他",
    "sort": "publishDate",
    "searchfield": "puborg",
    "title": "",
    "content": "",
    "pcode": "",
    "puborg": "",
    "timetype": "timeqb",
    "mintime": "",
    "maxtime": ""
}

# Seconds to wait for an HTTP response before giving up.
REQUEST_TIMEOUT = 30


###############################################################################
# tools
def useage():
    """Print the command-line usage text."""
    useage_text = """useage:
    # update json file
    python {} --update
    # use last json file
    python {}
    """.format(SHELLNAME, SHELLNAME)
    print(useage_text)


# Weekday lookup
def get_week(str):
    """Return the weekday index (0 = Monday) of a "YYYYMMDD" string.

    The parameter name shadows the builtin; it is kept unchanged so the
    function's signature stays the same.
    """
    date_obj = datetime.strptime(str, '%Y%m%d')
    return date_obj.weekday()


# Zero-pad the numbers of a date string
def format_date(str):
    """Yield every digit run found in *str*, padded to at least 2 digits.

    The parameter name shadows the builtin; it is kept unchanged so the
    function's signature stays the same.
    """
    for each in re.findall(r'\d+', str):
        yield each.zfill(2)


# Generate consecutive date strings
def gen_dates(b_date, days):
    """Yield *days* consecutive "YYYYMMDD" strings starting at *b_date*."""
    day = timedelta(days=1)
    for i in range(days):
        yield (b_date + day * i).strftime('%Y%m%d')


def PushInfo(date_str, multi_json):
    """Return the one-line description of *date_str* for the message.

    Args:
        date_str: Date key in "YYYYMMDD" form.
        multi_json: Loaded holiday table with 'holidays' and 'workdays'.

    Returns:
        A rest-day text when the date is in 'holidays', a work-day text
        when it is in 'workdays', otherwise a generic working-day text.
    """
    holidays = multi_json['holidays']
    workdays = multi_json['workdays']
    if date_str in holidays:
        date_value = holidays[date_str]
        if len(date_value) > 2:
            # Drop the trailing field (the total number of days off).
            date_value = "|".join(date_value[:-1]) + ", 好好休息哦!"
        else:
            date_value = "|".join(date_value) + ", 好好休息哦!"
    elif date_str in workdays:
        date_value = "|".join(workdays[date_str]) + ", 好好工作哦!"
    else:
        date_value = "{}|工作日, 好好工作哦!".format(
            WEEKDAYS[get_week(date_str)])
    return date_value


# Markdown template of the pushed message.
MESSAGE = """### 节假日推送
> 今天:
> + {today_key}
> + {today_value}
>
> 明天:
> + {tomorrow_key}
> + {tomorrow_value}
>
> 下一个节假日:
> + {next_key}
> + {next_value}
>
> 今年剩余节假日:
> + {last_holidays} 天"""


def MsgPusher():
    """Build the daily holiday summary and push it to all subscribers.

    Reads the holiday table from BASE_FILE, describes today and tomorrow,
    finds the next official holiday that differs from today's entry,
    counts the official holiday days remaining in the table, and sends the
    formatted MESSAGE through WxPusher.
    """
    # Query the subscribed users to obtain their UIDs.
    query_user_ret = WxPusher.query_user(1, 100, APP_TOKEN)
    users_info = query_user_ret['data']['records']
    # Replace the shared list in place: appending on every run would make
    # the UID list grow with duplicates each time the job fires.
    UID_LIST[:] = [each_user['uid'] for each_user in users_info]

    with open(BASE_FILE, 'r', encoding='utf-8') as ff:
        multi_json = json.load(ff)
    holidays = multi_json['holidays']

    # Take the date at call time; DATE_NOW is frozen at start-up and would
    # be stale when this job runs from the long-lived scheduler.
    now = datetime.now()
    today_key = now.strftime("%Y%m%d")
    today_value = PushInfo(today_key, multi_json)
    tomorrow_key = (now + timedelta(days=1)).strftime("%Y%m%d")
    tomorrow_value = PushInfo(tomorrow_key, multi_json)

    # Find the next holiday.  Defaults cover the "nothing found" case,
    # including an empty holiday table.
    next_key = None
    next_value = None
    today_name = holidays[today_key][0] if today_key in holidays else None
    today_dt = datetime.strptime(today_key, "%Y%m%d")
    for key in sorted(holidays):
        value = holidays[key]
        # Entries with more than two fields are official holidays
        # (name(s), weekday, days off); weekend entries are shorter.
        if key <= today_key or len(value) <= 2:
            continue
        # Skip the holiday that today already belongs to.
        if today_name is not None and value[0] == today_name:
            continue
        next_key = key
        # Number of days until that holiday starts.
        diff_days = (datetime.strptime(next_key, "%Y%m%d") - today_dt).days
        next_value = "|".join(value[:-1]) + \
            ", 再过 {} 天就可以玩 {} 天了哦!".format(diff_days, value[-1])
        break

    # Count the official holiday days that are still ahead.
    last_holidays = sum(
        1 for key, value in holidays.items()
        if key > today_key and len(value) > 2
    )

    message = MESSAGE.format(
        today_key=today_key,
        today_value=today_value,
        tomorrow_key=tomorrow_key,
        tomorrow_value=tomorrow_value,
        next_key=next_key,
        next_value=next_value,
        last_holidays=str(last_holidays))

    print(">>> 开始推送消息")
    result = WxPusher.send_message(message, UID_LIST, APP_TOKEN,
                                   content_type=3)
    print(result)
    print("<<< 推送消息完成")


###############################################################################
# Save the URLs of all published holiday notices
def SaveUrl(url):
    """Fetch the notice list at *url* and record new notices in JSON_FILE.

    Every ``<td class="info">`` element of the response is read as one
    notice: its ``<a>`` supplies the title and link, and the text of its
    last ``<li>`` supplies the publication date.  Notices are stored as
    ``{"YYYYMMDD": [title, href]}``.  Scanning stops at the first
    publication date that is already stored.

    Returns:
        The path of the JSON index file (``JSON_FILE``).
    """
    respose_html = requests.get(url, params=PARAMS, headers=HEADERS,
                                timeout=REQUEST_TIMEOUT)
    soup_html = BeautifulSoup(respose_html.content, "html.parser")
    info_elems = soup_html.find_all("td", attrs={"class": "info"})

    # Load the existing index once instead of re-reading it for every entry.
    if os.path.exists(JSON_FILE) and os.path.getsize(JSON_FILE):
        with open(JSON_FILE, 'r', encoding='utf-8') as ff:
            json_obj = json.load(ff)
    else:
        json_obj = {}

    changed = False
    for info_elem in info_elems:
        li_elems = info_elem.find_all("li")
        link = info_elem.find('a')
        # Skip malformed entries instead of raising IndexError/TypeError.
        if not li_elems or link is None or not link.get('href'):
            continue

        # Keep only the digits of the text after the last colon (the date).
        push_date_str = li_elems[-1].text
        push_date_str = push_date_str.split(':')[-1].strip(' ')
        push_date_str = ''.join(re.findall('[0-9]+', push_date_str))

        if push_date_str in json_obj:
            # Already recorded: stop scanning the remaining entries.
            break
        json_obj[push_date_str] = [link.text, link['href']]
        changed = True

    # Write the index back a single time, and only when something was added.
    if changed:
        with open(JSON_FILE, 'w', encoding='utf-8') as ff:
            json.dump(json_obj, ff, indent=4, sort_keys=True,
                      ensure_ascii=False)
    return JSON_FILE


###############################################################################
# Crawl the holiday notice page published for the current year
# HOLIDAY_DIC maps "YYYYMMDD" -> [holiday name(s)..., weekday, days off]
# WORKDAY_DIC maps "YYYYMMDD" -> ["调休日", weekday]

# Raw strings avoid invalid-escape warnings; ``\d+天`` also accepts counts
# containing a zero (``[1-9]+`` could not match "10天"); both ASCII and
# full-width punctuation are accepted.
HOLIDAY_LINE_RE = re.compile(
    r'^.{2}(?P<holiday>.*)[::](\d{4}年)?(?P<startday>\d+月\d+日)至?'
    r'(\d+月)?(?P<endday>\d+日)?放假(调休)?[,,]共(?P<offdays>\d+天)。'
    r'(?P<ondays>(\d+月\d+日[((]?星期.[))]?、?)+上班。)?$'
)
WORKDAY_DATE_RE = re.compile(r'\d+月\d+日')


def CrawPage(href):
    """Parse one holiday notice page into HOLIDAY_DIC and WORKDAY_DIC.

    The text of every ``<p>`` element is matched against HOLIDAY_LINE_RE.
    For a matching line, each day of the holiday (start day plus the
    announced number of days) is added to HOLIDAY_DIC, and each listed
    make-up working day is added to WORKDAY_DIC.  All dates are placed in
    THIS_YEAR.
    """
    respose_html = requests.get(href, params=PARAMS, headers=HEADERS,
                                timeout=REQUEST_TIMEOUT)
    soup_html = BeautifulSoup(respose_html.content, "html.parser")

    for info in soup_html.find_all("p"):
        re_obj = HOLIDAY_LINE_RE.search(info.text)
        if not re_obj:
            continue

        # One line may name several holidays separated by "、".
        holiday_names = re.split('、', re_obj.group('holiday'))

        # First day off, normalised to "YYYYMMDD".
        startday = re_obj.group('startday')
        startday = str(THIS_YEAR) + ''.join(format_date(startday))
        start_dt = datetime.strptime(startday, "%Y%m%d")

        # Announced total number of days off.
        offdays = int(re.sub(r'\D', '', re_obj.group('offdays')))

        for offday in gen_dates(start_dt, offdays):
            HOLIDAY_DIC[offday] = holiday_names + [
                WEEKDAYS[get_week(offday)], str(offdays)]

        # Make-up working days, if the line lists any.
        ondays = re_obj.group('ondays')
        if ondays is not None:
            for onday in WORKDAY_DATE_RE.findall(ondays):
                onday = str(THIS_YEAR) + ''.join(format_date(onday))
                WORKDAY_DIC[onday] = ["调休日", WEEKDAYS[get_week(onday)]]


def CrawMain():
    """Crawl every indexed notice whose title mentions THIS_YEAR."""
    with open(JSON_FILE, 'r', encoding='utf-8') as ff:
        json_obj = json.load(ff)
    this_year = str(THIS_YEAR)
    for title, href in (value[:2] for value in json_obj.values()):
        if this_year in title:
            CrawPage(href)


###############################################################################
# Data processing
# WEEKEND_DIC maps "YYYYMMDD" -> "星期六" or "星期天"
def All_WEEEK():
    """Fill WEEKEND_DIC with the weekend days from THIS_MONTH to December.

    Returns:
        The module-level WEEKEND_DIC.
    """
    for month in range(THIS_MONTH, 13):
        for week in calendar.monthcalendar(THIS_YEAR, month):
            # The last two columns of a calendar row are read as
            # Saturday and Sunday; 0 marks a day outside this month.
            for day, label in zip(week[-2:], ("星期六", "星期天")):
                if day == 0:
                    continue
                month_day = ''.join(format_date('{}-{}'.format(month, day)))
                WEEKEND_DIC[str(THIS_YEAR) + month_day] = label
    return WEEKEND_DIC


def HolidayMain():
    """Merge weekends into the holiday table and save it to BASE_FILE.

    Weekend days that are not already holidays are added, the table is
    re-ordered by date key, make-up working days are removed from it, and
    the result is dumped as JSON under the keys 'holidays' and 'workdays'.
    """
    global HOLIDAY_DIC

    # Add weekends without overwriting crawled holiday entries.
    for day_key, weekday_name in All_WEEEK().items():
        HOLIDAY_DIC.setdefault(day_key, [weekday_name])

    # Rebuild the table ordered by date key.
    unsorted_holidays = HOLIDAY_DIC
    HOLIDAY_DIC = OrderedDict(
        (day_key, unsorted_holidays[day_key])
        for day_key in sorted(unsorted_holidays)
    )

    # Make-up working days are not days off.
    for work_key in WORKDAY_DIC:
        HOLIDAY_DIC.pop(work_key, None)

    MULTI_DIC['holidays'] = HOLIDAY_DIC
    MULTI_DIC['workdays'] = WORKDAY_DIC

    # Save to JSON.
    with open(BASE_FILE, 'w', encoding='utf-8') as ff:
        json.dump(MULTI_DIC, ff, indent=4, sort_keys=True,
                  ensure_ascii=False)


###############################################################################
# Entry points
def update_main():
    """Refresh the notice index and rebuild the holiday table."""
    global DATE_NOW, THIS_YEAR, THIS_MONTH, THIS_DAY

    # Refresh the date snapshot and drop data from the previous refresh:
    # this function also runs monthly inside the long-lived scheduler,
    # where the import-time values would otherwise be reused.
    DATE_NOW = datetime.now()
    THIS_YEAR = DATE_NOW.year
    THIS_MONTH = DATE_NOW.month
    THIS_DAY = DATE_NOW.day
    HOLIDAY_DIC.clear()
    WORKDAY_DIC.clear()
    WEEKEND_DIC.clear()
    MULTI_DIC.clear()

    url = "http://sousuo.gov.cn/list.htm"
    json_file = SaveUrl(url)
    print('[{}]已更新.'.format(json_file))
    CrawMain()
    print('信息已爬取完成.')
    HolidayMain()
    print('[{}]已保存.'.format(BASE_FILE))


def cron_main():
    """Start the blocking scheduler: daily push and monthly data refresh."""
    print(">>> 开始执行定时任务")
    sched = BlockingScheduler()
    # Push the message every day at 18:00:00.
    sched.add_job(MsgPusher, 'cron', hour='18', minute='0', second='0')
    # Refresh the data on the first day of every month at 00:00:00.
    sched.add_job(update_main, 'cron', day='1', hour='0', minute='0',
                  second='0')
    sched.start()
    print("<<< 定时任务已退出")


if __name__ == '__main__':
    arg_list = sys.argv
    arg_num = len(arg_list)
    if arg_num > 2:
        useage()
    elif arg_num == 2:
        arg = arg_list[1]
        if arg == '--update':
            update_main()
        elif arg == '--cron':
            cron_main()
        else:
            useage()
    else:
        cron_main()
写在后面
想要直接体验的朋友,可以添加我创建的应用体验
每天早上 9:30 会推送一次
点击我
声明:本站所有资源均由网友分享,如有侵权内容,请在文章下方留言,本站会立即处理。