-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathp_mdx.py
More file actions
72 lines (66 loc) · 2.72 KB
/
p_mdx.py
File metadata and controls
72 lines (66 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import requests
from bs4 import BeautifulSoup
import re
import pymysql
import psycopg2
def create():
#mysql连接数据库
#conn = pymysql.connect("localhost", "root", "root", "test")#连接数据库
#postgresql链接数据库
conn = psycopg2.connect(database="edw", user="gpadmin", password="gpadmin123", host="192.168.0.233", port="5432")
cur = conn.cursor()
cur.execute("drop table if exists t_fun_max")
#mysql脚本
#cur.execute("create table t_fun_max(id int primary key auto_increment,name varchar(200),grammar varchar(1000),describe1 varchar(1000));")
#postgresql脚本
cur.execute("create table t_fun_max(id serial primary key, name varchar,grammar varchar,describe1 varchar);")
conn.commit()
cur.close()
conn.close()
def insert(value):
#mysql连接数据库
#conn = pymysql.connect("localhost", "root", "root", "test")#连接数据库
#postgresql链接数据库
conn = psycopg2.connect(database="edw", user="gpadmin", password="gpadmin123", host="192.168.0.233", port="5432")
cur = conn.cursor()
sql = "insert into t_fun_max(name,grammar,describe1) values(%s, %s, %s)"
try:
cur.execute(sql,value)
conn.commit()
print('插入数据成功')
except Exception as e:
conn.rollback()
print(e)
print("插入数据失败")
conn.close()
create() #创建表
#re匹配需要的数据
pertern = re.compile(r'<a data-linktype="relative-path" href="(.*?)">(.*?)</a>.*?<td>(.*?)</td>',re.S)
url = 'https://docs.microsoft.com/zh-cn/sql/mdx/mdx-function-reference-mdx?view=sql-server-2017'
res = requests.get(url)
res.encoding='utf-8'
#print(res.status_code)
soup = BeautifulSoup(res.text, 'html.parser')
data = soup.find_all('tbody')
data = str(data)
items = re.findall(pertern, data)
#获取第一层数据,钻取到第二层详情函数信息进行拼接插入数据库
for i,item in enumerate(items):
#print(item)
#通过第一层的链接地址拼接第二层链接地址获取函数的详情信息
res = requests.get('https://learn.microsoft.com/zh-cn/sql/mdx/'+item[0])
res.encoding='utf-8'
print(res.status_code)
#因为该页面有多组标签数据,只获取第一组数据即可,通过正则表达进行过滤
pertern = re.compile(r'<pre.*?><code.*?>(.*?)</code></pre>[^\1]+',re.S)
soup = BeautifulSoup(res.text, 'html.parser')
data = soup.find_all('main')
data = str(data)
fun_text = re.findall(pertern, data)
for k,fitem in enumerate(fun_text):
print(fitem)
#从新拼接参数
v_item = (''+item[1].strip()+'',''+fitem.strip().replace(',','').replace('\n','')+'',''+item[2]+'')
#print(v_item)
#调用插入数据方法
insert(v_item)