GeophyAI

Python与地球物理数据处理

0%

使用bs4解析网站内容-以抓取GJI网站的文献词条为例

按照卷号和期号抓取GJI(Geophysical Journal International)论文

论文词条输出格式如下:

1
序号. 作者列表, 论文标题, Geophysical Journal International, doi.

论文抓取函数如下(运行前需先导入依赖: import requests 以及 from bs4 import BeautifulSoup):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def get_seg_paper_lists(volume_no, issue_no, write_path):
    """Scrape the article list of one GJI issue and write it to a text file.

    Requests ``https://academic.oup.com/gji/issue/<volume>/<issue>``, and for
    every article entry on the page collects the combined author string, the
    title, and the text of the first link in the primary citation block
    (used as the DOI). Each entry is formatted with ``ref_type2`` and the
    numbered list is written to ``write_path``.

    Entries that lack the expected author or citation markup are skipped
    instead of aborting the whole run.

    Args:
        volume_no: Volume number (integer, formatted with ``%02d``).
        issue_no: Issue number (integer, formatted with ``%d``).
        write_path: Path of the output text file; it is overwritten.

    Returns:
        None. The list is written to ``write_path`` and a confirmation
        message is printed.
    """
    url = 'https://academic.oup.com/gji/issue/' + '%02d' % (volume_no) + '/%d' % (issue_no)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/51.0.2704.63 Safari/537.36'}
    # A timeout keeps the script from hanging forever on a stalled connection.
    html = requests.get(url, headers=headers, timeout=30)
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, "html.parser")
    issues = soup.find_all('div', 'al-article-item-wrap al-normal')

    contents = []
    for issue in issues:
        try:
            authors = get_authors(issue)
        except (AttributeError, TypeError):
            # The entry has no usable author markup (e.g. a missing authors
            # div or an author link without plain text): skip it.
            continue
        # combine_authors returns None for an empty author list.
        if authors is None:
            authors = 'None'

        # Title: fall back to the literal 'None' when the heading, its link,
        # or the link text is missing.
        heading = issue.find('h5', 'customLink item-title')
        title_link = heading.find('a') if heading is not None else None
        title = title_link.string if title_link is not None else None
        title = title.title() if title else 'None'

        # DOI: first child of the first link inside the primary citation div.
        history = issue.find('div', 'pub-history-row clearfix')
        citation = history.find('div', 'ww-citation-primary') if history is not None else None
        doi_link = citation.find('a') if citation is not None else None
        if doi_link is None or not doi_link.contents:
            continue
        doi = doi_link.contents[0]
        if not isinstance(doi, str):
            # The first child is a nested tag rather than text; no DOI text.
            continue

        if all([authors, title, doi]):
            # ref_type2 lower-cases the DOI itself, so no case conversion
            # is needed here.
            contents.append(ref_type2(authors, title, doi))

    # Write into files:
    with open(write_path, 'w', encoding='utf-8') as f:
        f.write('Volume%d_Issue%d' % (volume_no, issue_no) + '\n\n')
        # Numbering starts at 00, as in the original output format.
        for idx, ref in enumerate(contents):
            f.write('%02d. ' % (idx) + ref)
            f.write('\n')

    print('Lists have been written into %s' % (write_path))

用到的其它函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
def combine_authors(author_list):
    """Join author names into one string, with 'and' before the last name.

    One name is returned unchanged, two names become ``'A and B'``, and
    three or more become ``'A, B and C'``. An empty list yields ``None``.
    """
    count = len(author_list)
    if count == 0:
        # Nothing to combine; callers check for None.
        return None
    if count == 1:
        return author_list[0]
    # For exactly two names the comma-join of the single leading name is
    # just that name, so one expression covers both the 2 and >2 cases.
    leading = ', '.join(author_list[:-1])
    return leading + ' and ' + author_list[-1]

def ref_type2(author_list, title, doi):
    """Format one reference as 'authors, title, journal, doi.'.

    ``author_list`` is the already-combined author string. The DOI is
    lower-cased and terminated with a period; the journal name is fixed.
    """
    doi_field = doi.lower() + '.'
    fields = (
        author_list,
        title,
        'Geophysical Journal International',
        doi_field,
    )
    return ', '.join(fields)

def get_authors(issue):
    """Return the combined author string of one article entry.

    Looks up the 'al-authors-list' div inside ``issue``, title-cases the
    text of every link in it, and joins the names with ``combine_authors``.
    Missing markup raises (e.g. AttributeError), which the caller uses to
    skip the entry.
    """
    container = issue.find('div', 'al-authors-list')
    names = []
    for link in container.find_all('a'):
        names.append(link.string.title())
    return combine_authors(names)

程序运行需执行如下命令:
1
get_seg_paper_lists(卷号, 期号, 文件写入路径)