白荼日记 - 爬虫 (web scraping)
https://blog.iletter.top/tag/%E7%88%AC%E8%99%AB/
- 
A Tampermonkey script for extracting 河北住建厅 (Hebei Department of Housing and Urban-Rural Development) announcements
https://blog.iletter.top/archives/361/
2025-06-28T17:21:27+08:00
// ==UserScript==
// @name         河北住建厅公告提取
// @version      1.0
// @description  Click the button to extract 河北住建厅 announcements; the extraction logic is chosen automatically from the page URL
// @author       YourName
// @match        https://zfcxjst.hebei.gov.cn/*
// @grant        GM_xmlhttpRequest
// ==/UserScript==
(function () {
  'use strict';
  // Create the floating trigger button
  const button = document.createElement('button');
  button.innerText = '提取公告';
  button.style.position = 'fixed';
  button.style.top = '10px';
  button.style.right = '10px';
  button.style.zIndex = '99999';
  button.style.padding = '8px 12px';
  button.style.backgroundColor = '#007bff';
  button.style.color = 'white';
  button.style.border = 'none';
  button.style.borderRadius = '4px';
  button.style.cursor = 'pointer';
  button.style.fontSize = '14px';
  button.style.boxShadow = '0 2px 4px rgba(0,0,0,0.2)';
  document.body.appendChild(button);
  // Extract announcement data from 公告公示-style list pages (div.pson_listWenhao, entries carry a 文号 document number)
  function extractHBZJTData_GongGaoGongShi(types) {
    console.log("页面标题:", document.title);
    const result = [];
    const contentDiv = document.querySelector('div.pson_listWenhao');
    if (!contentDiv) {
      console.log("未找到目标 div 元素");
      return;
    }
    contentDiv.querySelectorAll('li').forEach(li => {
      const aTag = li.querySelector('a');
      const dateSpan = li.querySelector('span.date');
      const wenhaoSpan = li.querySelector('span.wenhao');
      if (!aTag || !dateSpan) return;
      let href = aTag.getAttribute('href');
      let title = aTag.textContent.trim();
      let wenhao = '';
      if (wenhaoSpan) {
        wenhao = wenhaoSpan.textContent.trim();
      }
      let fullUrl = href;
      if (href.startsWith('/hbzjt')) {
        fullUrl = new URL(href, 'https://zfcxjst.hebei.gov.cn').href;
      }
      result.push({
        title: wenhao ? `[${wenhao}] ${title}` : title,
        url: fullUrl,
        date: dateSpan.textContent.trim(),
        type: types
      });
    });
  // result.forEach(item => {
  //    console.log(`${item.date}|${item.type} | ${item.title} -> ${item.url}`);
  // });
  // Send the data to the API
    sendToAPI(result);
  }
  function sendToAPI(dataArray) {
    GM_xmlhttpRequest({
      method: 'POST',
      url: 'http://192.168.196.81:8081/sys_api/api/buildingspider/batch',
      data: JSON.stringify(dataArray),
      headers: {
        'Content-Type': 'application/json'
      },
      onload: function(response) {
        console.log('数据发送成功:', response.responseText);
        alert('数据已成功发送到服务器!');
      },
      onerror: function(error) {
        console.error('数据发送失败:', error);
        alert('数据发送失败,请检查网络或服务器状态!');
      }
    });
  }
  // Extract announcement data from news-style list pages (div.pson_list, no 文号 document number)
  function extractHBZJTData_XinWenZiXun(types) {
    console.log("页面标题:", document.title);
    const result = [];
    const contentDiv = document.querySelector('div.pson_list');
    if (!contentDiv) {
      console.log("未找到目标 div 元素");
      return;
    }
    contentDiv.querySelectorAll('li').forEach(li => {
      const aTag = li.querySelector('a');
      const dateSpan = li.querySelector('span.date');
      if (!aTag || !dateSpan) return;
      let href = aTag.getAttribute('href');
      let title = aTag.textContent.trim();
      let fullUrl = href;
      if (href.startsWith('/hbzjt')) {
        fullUrl = new URL(href, 'https://zfcxjst.hebei.gov.cn').href;
      }
      result.push({
        title: title,
        url: fullUrl,
        date: dateSpan.textContent.trim(),
        type: types
      });
    });
    // result.forEach(item => {
    //    console.log(`${item.date}|${item.type} | ${item.title} -> ${item.url}`);
    // });
    // Send the data to the API
    sendToAPI(result);
  }
  // Fallback (customize as needed)
  function fallbackFunction() {
    console.log("不爬虫,页面标题:", document.title);
  }
  // Button click: pick the extractor based on the current URL
  button.addEventListener('click', () => {
    const currentUrl = window.location.href;
    if (currentUrl.includes('https://zfcxjst.hebei.gov.cn/hbzjt/zcwj/gggs/')) {
      extractHBZJTData_GongGaoGongShi("河北省住房和城乡建设厅,公告公示");
    } else if (currentUrl.includes('https://zfcxjst.hebei.gov.cn/hbzjt/zcwj/tfwj/')) {
      extractHBZJTData_GongGaoGongShi("河北省住房和城乡建设厅,厅发文件");
    } else if (currentUrl.includes('https://zfcxjst.hebei.gov.cn/hbzjt/zcwj/gfxwj/')) {
      extractHBZJTData_GongGaoGongShi("河北省住房和城乡建设厅,厅发规范性文件");
    } else if (currentUrl.includes('https://zfcxjst.hebei.gov.cn/hbzjt/xwzx/szyw/')) {
      extractHBZJTData_XinWenZiXun("河北省住房和城乡建设厅,时政要闻");
    } else if (currentUrl.includes('https://zfcxjst.hebei.gov.cn/hbzjt/xwzx/jsyw/')) {
      extractHBZJTData_XinWenZiXun("河北省住房和城乡建设厅,建设要闻");
    } else if (currentUrl.includes('https://zfcxjst.hebei.gov.cn/hbzjt/xwzx/sxdt/')) {
      extractHBZJTData_XinWenZiXun("河北省住房和城乡建设厅,市县动态");
    } else if (currentUrl.includes('https://zfcxjst.hebei.gov.cn/hbzjt/xwzx/mtgz/')) {
      extractHBZJTData_GongGaoGongShi("河北省住房和城乡建设厅,媒体关注");
    } else if (currentUrl.includes('https://zfcxjst.hebei.gov.cn/hbzjt/xwzx/zcjd/')) {
      extractHBZJTData_XinWenZiXun("河北省住房和城乡建设厅,政策解读");
    } else {
      fallbackFunction();
    }
  });
})();
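
For testing the receiving end without the browser, here is a minimal sketch of the same POST that sendToAPI performs. The endpoint URL and the field names (title, url, date, type) are taken from the script above; the record values are hypothetical:

# Replays the userscript's batch POST from Python (hypothetical sample record).
import requests

sample = [{
    "title": "[冀建质安〔2025〕1号] 某公告标题",  # made-up 文号 and title
    "url": "https://zfcxjst.hebei.gov.cn/hbzjt/zcwj/gggs/example.html",
    "date": "2025-06-28",
    "type": "河北省住房和城乡建设厅,公告公示",
}]

resp = requests.post(
    "http://192.168.196.81:8081/sys_api/api/buildingspider/batch",
    json=sample,  # same as JSON.stringify(dataArray) with a JSON Content-Type
    timeout=10,
)
print(resp.status_code, resp.text)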
 
- 
Sci-tech innovation enterprise spiders with alert import
https://blog.iletter.top/archives/360/
2025-06-25T09:04:00+08:00
1. After each day's crawl, sync the scraped announcements and notices to the database. 2. Check that day's announcements for sci-tech innovation enterprises and, if any are found, push an alert (the alert itself is not written yet; for now this just rounds out the data). Fuzzy matching on a keyword list, plus an exclusion-word list to drop irrelevant entries, is used to find the matching records. (A sketch of the daily scheduling follows the three spiders below.)

The checker script:

# Check whether today's data contains any sci-tech enterprise list entries
import re
import time
import pymysql
import requests
from gxt_spider import get_industry
from kjt_spider import get_sci_kjt
from sdszf_spider import get_sci_sdszf
from jinja2 import Template
import json
def connect_to_database():
    connection = pymysql.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='my_database_test',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )
    return connection
def query_today_kc_enterprises():
    keywords = [
        "科技型中小企业",
        "高新技术企业",
        "众创空间",
        "科技领军企业",
        "技术先进型服务企业",
        "技术创新示范企业",
        "专精特新",
        "科技企业",
        "瞪羚",
        "独角兽",
        "科技小巨人企业",
        '小巨人']
    not_contain_keywords = ["取消","组织申报","认定和复核","申报","补助名单","绩效评价"]
    sql = build_sql_query(keywords, not_contain_keywords)
    connection = connect_to_database()
    try:
        with connection.cursor() as cursor:
            cursor.execute(sql)
            results = cursor.fetchall()
            return {
                "total": len(results),
                "list": results
            }
    finally:
        connection.close()
def build_sql_query(keywords, not_contain_keywords):
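    # The keyword lists are interpolated straight into the SQL string below;
    # that is tolerable only because both lists are hard-coded above, never user input.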
    like_conditions = " OR ".join([f"title LIKE '%{keyword}%'" for keyword in keywords])
    not_like_conditions = " and ".join([f"title NOT LIKE '%{not_contain_keyword}%'" for not_contain_keyword in not_contain_keywords])
    sql = f"""
        SELECT 
        CASE type
            WHEN '1' THEN '山东省科学技术厅'
            WHEN '2' THEN '山东省工业和信息化厅'
            WHEN '3' THEN '山东省人民政府'
            ELSE '未知类型'
        END AS type_name,date,title,url FROM `sci_spider`
        WHERE ({like_conditions}) 
        AND ({not_like_conditions})
        AND DATE(create_date) = DATE(NOW())
    """
    return sql
def mail_sender(content):
    import smtplib
    from email.mime.text import MIMEText
    from email.header import Header
    # Third-party SMTP service
    mail_host = "smtp.163.com"  # SMTP server
    mail_user = "18631839859@163.com"  # username
    mail_pass = "GENGs7dM45TJDH6y"  # password
    sender = '18631839859@163.com'
    receivers = ['wonder1999@126.com']  # recipients; set to your QQ mail or any other mailbox
    # message = MIMEText(content, 'plain', 'utf-8')
    message = MIMEText(content, 'html', 'utf-8')
    message['From'] = Header("科技型中小企业通知", 'utf-8')
    message['To'] = Header("科技型中小企业", 'utf-8')
    subject = '科技型中小企业通知'
    message['Subject'] = Header(subject, 'utf-8')
    try:
        smtpObj = smtplib.SMTP()
        smtpObj.connect(mail_host, 25)  # 25 is the standard SMTP port
        smtpObj.login(mail_user, mail_pass)
        smtpObj.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except smtplib.SMTPException:
        print("Error: 无法发送邮件")
def wx_web_hook(data):
    """
    Send a Markdown-formatted message through an Enterprise WeChat (企业微信) webhook.
    :param data: dict holding the notification data; must contain 'total' and 'list' keys
    :return: None
    """
    # Webhook URL (replace the key with your own)
    webhook_url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=ef84945d-2247-4f09-ac0b-be7a6607c24e"
    # Build the Markdown content
    content = f"**找到 {data['total']} 条疑似符合条件的记录:**\n"
    for row in data['list']:
        content += (
            f"- [{row['title']}]({row['url']}) "
            f"<font color=\"comment\">{row['date']}</font> "
            f"<font color=\"warning\">{row['type_name']}</font>\n"
        )
    # Build the request payload
    payload = {
        "msgtype": "markdown",
        "markdown": {
            "content": content
        }
    }
    # Send the request and handle the response
    try:
        response = requests.post(webhook_url, json=payload)
        response.raise_for_status()  # raise on HTTP errors
        result = response.json()
        if result.get("errcode") == 0:
            print("✅ 消息发送成功")
        else:
            print(f"❌ 消息发送失败: {result.get('errmsg')}")
    except requests.exceptions.RequestException as e:
        print(f"⚠️ 请求异常: {e}")
if __name__ == '__main__':
    get_industry(1, 2)
    get_sci_kjt(1, 1)
    get_sci_sdszf(1, 3)
    data = query_today_kc_enterprises()
    title = f"找到 {data['total']} 条疑似符合条件的记录:"
    for row in data['list']:
        print(row)
    if data['total'] > 0:
        wx_web_hook(data)
        # mail_sender('测试消息')

The 工信厅 (Department of Industry and Information Technology) spider:

import re
import time
import pymysql
import requests
# Database connection
def connect_to_database():
    connection = pymysql.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='my_database_test',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )
    return connection
def find_new_date():
    connection = connect_to_database()
    try:
        with connection.cursor() as cursor:
            sql = "SELECT date FROM `sci_spider` WHERE type = '2' ORDER BY DATE(date) DESC LIMIT 0,1"
            cursor.execute(sql)
            results = cursor.fetchall()
            return results[0]['date']
    except Exception:
        # No rows yet (or the query failed): fall back to "no previous date"
        return ''
    finally:
        connection.close()
def get_industry(page_num, type):
    url = (f'http://gxt.shandong.gov.cn/col/col15201/index.html?uid=586830&pageNum={page_num}')
    user_Agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
    headers = {
        "Referer": None,
        "User-Agent": user_Agent
    }
    while True:
        try:
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            response = response.text
            break
        except Exception:
            print("请求失败,尝试睡眠一会(半小时)")
            sleep_time = 60 * 30
            time.sleep(sleep_time)
            print("睡眠结束,继续运行...")
            continue
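    # Pull dates, detail-page links, and titles out of the raw HTML with regexes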
    da = re.findall(r'<div class="bottom">            <span>                (.*?)            </span>', response)
    in_url = re.findall(r'target="_blank" href="(.*?)">', response)
    content = re.findall(r'<a title="(.*?)" target="_blank"', response)
    for i in range(0, len(da)):
        print(str(i+1) + ' : ' + da[i][0:10] + ' : '+content[i]+ ' : ' + in_url[i])
    if len(da)*2 != len(in_url) or len(da)*2 != len(content):
        print("数据不完整,跳过插入")
        return
    new_date = find_new_date()
    if not new_date or new_date == '':
        new_date = '1970-01-01'  # default minimum date
    connection = connect_to_database()
    try:
        with connection.cursor() as cursor:
            sql = """
               INSERT INTO `my_database_test`.`sci_spider` 
               (`title`, `url`, `date`, `type`, `create_date`) 
               VALUES (%s, %s, %s, %s, NOW())
               """
            count = 0
            for i in range(len(da)):
                if da[i][0:10] > new_date:
                    count = count + 1
                    cursor.execute(sql, (content[i], in_url[i], da[i][0:10], type))
        connection.commit()
        print(f"已成功插入 {count} 条数据")
    except Exception as e:
        print(f"插入数据失败: {e}")
        connection.rollback()
    finally:
        connection.close()
if __name__ == '__main__':
    get_industry(1, 2)

The 科技厅 (Department of Science and Technology) spider:

import re
import time
import pymysql
import requests
def connect_to_database():
    connection = pymysql.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='my_database_test',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )
    return connection
def find_new_date():
    connection = connect_to_database()
    try:
        with connection.cursor() as cursor:
            sql = "SELECT date FROM `sci_spider` WHERE type = '1' ORDER BY DATE(date) DESC LIMIT 0,1"
            cursor.execute(sql)
            results = cursor.fetchall()
            return results[0]['date']
    except Exception:
        # No rows yet (or the query failed): fall back to "no previous date"
        return ''
    finally:
        connection.close()
def get_sci_kjt(page_num, type):
    url = (f'http://kjt.shandong.gov.cn/col/col13360/index.html?uid=85651&pageNum={page_num}')
    user_Agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
    headers = {
        "Referer": None,
        "User-Agent": user_Agent
    }
    while True:
        try:
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            response = response.text
            break
        except Exception:
            print("请求失败,尝试睡眠一会(半小时)")
            sleep_time = 60 * 30
            time.sleep(sleep_time)
            print("睡眠结束,继续运行...")
            continue
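    # Pull dates, detail-page links, and titles out of the raw HTML with regexes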
    da = re.findall(r'<span class="pull-right">(.*?)</span>', response)
    sci_url = re.findall(r'href="(.*?)" class="ellipsis-line-clamp">', response)
    content = re.findall(r'<s></s>(.*?)</a></li>', response)
    for i in range(0, len(da)):
        print(str(i+1) + ' : ' + da[i][0:10] + ' : '+content[i]+ ' : ' + sci_url[i])
    if len(da) != len(sci_url) or len(da) != len(content):
        print("数据不完整,跳过插入")
        return
    new_date = find_new_date()
    if not new_date or new_date == '':
        new_date = '1970-01-01'  # default minimum date
    connection = connect_to_database()
    try:
        with connection.cursor() as cursor:
            sql = """
            INSERT INTO `my_database_test`.`sci_spider` 
            (`title`, `url`, `date`, `type`, `create_date`) 
            VALUES (%s, %s, %s, %s, NOW())
            """
            count = 0
            for i in range(len(da)):
                if da[i] > new_date:
                    count = count + 1
                    cursor.execute(sql, (content[i], sci_url[i], da[i], type))
        connection.commit()
        print(f"已成功插入 {count} 条数据")
    except Exception as e:
        print(f"插入数据失败: {e}")
        connection.rollback()
    finally:
        connection.close()
if __name__ == '__main__':
    get_sci_kjt(1, 1)

The 山东省人民政府 (Shandong Provincial People's Government) spider:

import re
import time
import pymysql
import requests
def connect_to_database():
    connection = pymysql.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='my_database_test',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor
    )
    return connection
def find_new_date():
    connection = connect_to_database()
    try:
        with connection.cursor() as cursor:
            sql = "SELECT date FROM `sci_spider` WHERE type = '3' ORDER BY DATE(date) DESC LIMIT 0,1"
            cursor.execute(sql)
            results = cursor.fetchall()
            return results[0]['date']
    except Exception:
        # No rows yet (or the query failed): fall back to "no previous date"
        return ''
    finally:
        connection.close()
def get_sci_sdszf(page_num, type):
    url = (f'http://www.shandong.gov.cn/col/col94237/index.html?uid=633233&pageNum={page_num}')
    user_Agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
    headers = {
        "Referer": None,
        "User-Agent": user_Agent
    }
    while True:
        try:
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            response = response.text
            break
        except Exception:
            print("请求失败,尝试睡眠一会(半小时)")
            sleep_time = 60 * 30
            time.sleep(sleep_time)
            print("睡眠结束,继续运行...")
            continue
    # Extract the dates
    da = re.findall(r'<span>\s*(\d{4}-\d{2}-\d{2})\s*</span>', response)
    # Extract the links
    sci_url = re.findall(r'href="(.*?)"\s+target="_blank"\s+title="', response)
    # Extract the titles (the title attribute)
    content = re.findall(r'\s+target="_blank"\s+title="(.*?)"', response)
    print(len(da), len(sci_url), len(content))
    for i in range(0, len(da)):
        print(str(i+1) + ' : ' + da[i][0:10] + ' : '+content[i]+ ' : ' + sci_url[i])
    if len(da) != len(sci_url) or len(da) != len(content):
        print("数据不完整,跳过插入")
        return
    new_date = find_new_date()
    if not new_date or new_date == '':
        new_date = '1970-01-01'  # default minimum date
    connection = connect_to_database()
    try:
        with connection.cursor() as cursor:
            sql = """
            INSERT INTO `my_database_test`.`sci_spider` 
            (`title`, `url`, `date`, `type`, `create_date`) 
            VALUES (%s, %s, %s, %s, NOW())
            """
            count = 0
            for i in range(len(da)):
                if da[i] > new_date:
                    count = count + 1
                    cursor.execute(sql, (content[i], sci_url[i], da[i], type))
        connection.commit()
        print(f"已成功插入 {count} 条数据")
    except Exception as e:
        print(f"插入数据失败: {e}")
        connection.rollback()
    finally:
        connection.close()
if __name__ == '__main__':
    get_sci_sdszf(1, 3)
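
The intro above says the pipeline runs after each day's crawl, but the post does not show the scheduling itself. A minimal sketch, assuming the three spider modules from above plus the checker's functions are importable (the module name check_today is hypothetical; in practice a cron entry such as 0 8 * * * python3 check_today.py does the same job):

# Hypothetical daily scheduler: run the three spiders, then the check, at 08:00.
import datetime
import time

from gxt_spider import get_industry
from kjt_spider import get_sci_kjt
from sdszf_spider import get_sci_sdszf
from check_today import query_today_kc_enterprises, wx_web_hook  # hypothetical module name for the checker script above

def run_once():
    get_industry(1, 2)
    get_sci_kjt(1, 1)
    get_sci_sdszf(1, 3)
    data = query_today_kc_enterprises()
    if data['total'] > 0:
        wx_web_hook(data)

while True:
    now = datetime.datetime.now()
    target = now.replace(hour=8, minute=0, second=0, microsecond=0)
    if target <= now:
        target += datetime.timedelta(days=1)  # past 08:00 today, wait for tomorrow
    time.sleep((target - now).total_seconds())
    run_once()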
 
- 
Writing an API service in Python
https://blog.iletter.top/archives/359/
2025-04-18T14:24:00+08:00
Implement a simple API service in Python that can be called via GET or POST and returns response data. Create a file named api_server.py and add the following code:

import flask
from flask import request, jsonify
from rescode.constants import ResponseCode
from spider.qiXinSpider import getQiXinCompInfo
'''
flask: web framework; the @server.route() decorator turns a plain function into a service endpoint
'''
# Create the service: this Python file itself becomes the server
server = flask.Flask(__name__)
# server.config['JSON_AS_ASCII'] = False
# @server.route() maps a URL path and the allowed request methods to the function below
@server.route('/python-api/getCompTageFromQiXin', methods=['GET', 'POST'])
def getCompTageFromQiXin():
    try:
        # Read the httpUrl parameter passed via the query string or form body
        httpUrl = request.values.get('httpUrl')
        if not httpUrl:
            return jsonify(ResponseCode.PARAM_REQUIRED), 400
        if 'www.sdxyjq.com:8080' in httpUrl:
            httpUrl = httpUrl.replace('www.sdxyjq.com:8080', 'www.sdxyjq.com')
        # Call the function in qiXinSpider.py; it needs:
        #   the Chrome binary path, e.g. D:\\APP\\TestChrome2\\Application\\chrome.exe
        #   the 信用金桥 HTTP link URL
        comp_info = getQiXinCompInfo(httpUrl, 'D:\\APP\\TestChrome2\\Application\\chrome.exe')
        data = {
            "httpUrl" : httpUrl,
            "qiXinSpider" : comp_info,
            "compName":comp_info['baseInfo']['ename']
        }
        return jsonify({**ResponseCode.SUCCESS, "data": data}), 200
    except Exception as e:
        # Catch-all exception handler
        return jsonify(ResponseCode.INTERNAL_ERROR), 500
@server.errorhandler(404)
def not_found(error):
    return jsonify({ "code": 404,  "message": "接口不存在" }), 404
@server.errorhandler(500)
def internal_error(error):
    return jsonify(ResponseCode.INTERNAL_ERROR), 500
if __name__ == '__main__':
    server.run(debug=True, port=8888, host='0.0.0.0')
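
A quick way to exercise the endpoint once the server is running. The route and the httpUrl parameter come from api_server.py above; the target page URL is made up for illustration:

# Hypothetical client call for the API above.
import requests

resp = requests.get(
    "http://127.0.0.1:8888/python-api/getCompTageFromQiXin",
    params={"httpUrl": "http://www.sdxyjq.com/some/company/page"},  # hypothetical page
    timeout=120,  # the spider drives a headless browser, so allow plenty of time
)
print(resp.json())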

The spider script (spider/qiXinSpider.py):

# -*- encoding:utf-8 -*-
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from seleniumwire import webdriver as wiredriver
from bs4 import BeautifulSoup
import requests
import gzip
import io
import json
# Initialize Selenium
def initialize_driver(chromePath: str):
    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument('--disable-gpu')  # disable GPU acceleration so request interception works reliably
    chrome_options.add_argument('--headless')  # do not open a visible browser window
    chrome_options.add_argument('--ignore-certificate-errors')  # ignore certificate errors
    # Point at the specified browser binary
    chrome_path = chromePath
    chrome_options.binary_location = chrome_path
    # Initialize the WebDriver with these options
    driver = wiredriver.Chrome(options=chrome_options)
    return driver
# Resolve the 启信宝 (Qixin) page URL from the iframe on the source page
def get_qixin_url(url):
    if 'www.sdxyjq.com:8080' in url:
        url = url.replace('www.sdxyjq.com:8080', 'www.sdxyjq.com')
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    iframe = soup.find('iframe')
    qiXinUrl = ''
    if iframe:
        src = iframe.get('src')
        qiXinUrl = src
    return qiXinUrl
# Parse an intercepted response body into JSON
def parse_response_body(response_body_binary):
    try:
        # Check for the gzip magic bytes (0x1f 0x8b) at the start of the payload
        is_gzip = response_body_binary.startswith(b'\x1f\x8b')
        if is_gzip:
            with gzip.GzipFile(fileobj=io.BytesIO(response_body_binary), mode='rb') as f:
                return json.loads(f.read().decode('utf-8'))
        else:
            # print('decoding directly as JSON')
            return json.loads(response_body_binary.decode('utf-8'))
    except Exception as e:
        print(f"格式化请求体失败: {e}")
        return None
def extract_response_body(requests, keyword):
    # `requests` here is the intercepted request list from selenium-wire (it shadows the requests module in this scope)
    for request in requests:
        if keyword in request.url:
            return request.response.body
    return None
def getQiXinCompInfo(url:str,chromePath:str):
    try:
        # Start the browser
        driver = initialize_driver(chromePath)
        # Open the 启信宝 page
        driver.get(get_qixin_url(url))
        time.sleep(3)
        # Wait for the page to finish loading before continuing; tune the timeout to your network
        wait = WebDriverWait(driver, 30)  # 30 s timeout
        # Wait until document.readyState becomes "complete"
        wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete")
        # Collect all intercepted requests
        requests = driver.requests
        # Company label info: getEntLabel
        res_getEntLabel = extract_response_body(requests, 'getEntLabel')
        if res_getEntLabel is not None:
            res_getEntLabel = parse_response_body(res_getEntLabel)
        else:
            res_getEntLabel = ''
        # Company address info: getGeocode
        res_getGeocode = extract_response_body(requests, 'getGeocode')
        if res_getGeocode is not None:
            res_getGeocode = parse_response_body(res_getGeocode)
        else:
            res_getGeocode = ''
        # Company business-registration info: getEntBasicInfoNew
        res_getEntBasicInfoNew = extract_response_body(requests,'getEntBasicInfoNew')
        if res_getEntBasicInfoNew is not None:
            res_getEntBasicInfoNew = parse_response_body(res_getEntBasicInfoNew)
        else:
            res_getEntBasicInfoNew = ''
        return {
            'baseInfo': res_getEntBasicInfoNew,
            'tagInfo': res_getEntLabel,
            'addressInfo': res_getGeocode,
        }
    finally:
        # Close the browser
        driver.quit()

What is Flask? Flask is a lightweight web framework written in Python that provides a flexible foundation for building web applications and RESTful APIs. Its design philosophy is "simple and extensible": it bundles no database layer or ORM, so developers choose their own stack.

Core features:
- Lightweight and flexible: no mandatory database or ORM; pick whatever fits (SQLite, MySQL, MongoDB, ...). Templating is pluggable, with Jinja2 as the default engine.
- Routing: decorators map URL paths to Python functions.
- Extensible: third-party extensions add functionality, e.g. Flask-SQLAlchemy (database access), Flask-RESTful (rapid RESTful APIs), Flask-Login (user authentication), Flask-JWT (JWT-based authentication).
- Developer-friendly: the built-in debug mode picks up code changes live, and unit and integration testing are well supported.
- Community: an active open-source community and rich documentation make it well suited to rapid development and learning.

Typical use cases:
- Small web applications: personal blogs, dashboards, internal tools.
- RESTful API development: JSON data interfaces, common in projects with separated front and back ends.
- Microservice architectures: the light footprint suits standalone service modules.
- Learning web development: the simple API and routing design make it an ideal teaching tool.

Flask vs. Django:

Aspect               Flask                                         Django
Design philosophy    Lightweight, flexible, minimal feature set    Heavyweight, full-featured, batteries-included
Default components   No ORM; template engine is pluggable          Built-in ORM (Django ORM) and template engine
Learning curve       Low (simple and direct)                       High (rich but complex)
Best fit             Small projects, APIs, fine-grained control    Large enterprise projects, rapid full-stack work
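
To make the routing point concrete, a minimal sketch (the route name and parameter are hypothetical, not part of the API above):

# Smallest useful Flask app: one decorated function per URL path.
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route('/hello', methods=['GET'])
def hello():
    # GET /hello?name=world  ->  {"message": "hello, world"}
    name = request.args.get('name', 'flask')
    return jsonify({"message": f"hello, {name}"})

if __name__ == '__main__':
    app.run(port=5000)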