# bs4, Selenium - RISS paper-search automation (with Python)
# Source: BIGFROG blog post, 2022-06-01 23:22
# -*- coding: utf-8 -*-
# * auto-parsing DB
import requests
from bs4 import BeautifulSoup
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
def eliminate_space(string):
    """Strip tab, LF, CR, NBSP (\\xa0) and space characters from *string*.

    Used to normalize scraped text fields (authors, publisher, keywords)
    before they are written to the CSV.
    """
    # One C-level pass removing exactly the same five characters the
    # original chained .replace() calls removed.
    drop_table = str.maketrans("", "", "\t\n\r\xa0 ")
    return string.translate(drop_table)
count = 0  # running total of papers parsed across all queries

# Search keywords: intellectual-property-related terms (in Korean).
query_list = ['지식재산','지적재산','산업재산','특허','실용신안','발명','기술이전','기술사업화','상표','디자인권','디자인 보호','저작권','신지식재산','지리적 표시','퍼블리시티권','유전자원','전통지식','영업비밀']

filename = "test2"
# newline="" is required when handing a file to csv.writer; without it the
# csv module writes an extra blank row between records on Windows.
f = open(filename + ".txt", "w", encoding="utf-8", newline="")
start = time.time()
csv_writer = csv.writer(f)
# Header row; the scraping loop must write its columns in this exact order.
csv_writer.writerow(
    ["대분류", "중분류", "제목", "1저자", "2저자", "3저자", "4저자", "발행기관", "발행년도", "학회명", "권호", "키워드","국문초록"]
)

driver_path = "chromedriver"
# NOTE(review): the positional executable_path argument was removed in
# Selenium >= 4.3; newer versions require
# webdriver.Chrome(service=Service(driver_path)) — confirm installed version.
driver = webdriver.Chrome(driver_path)

# RISS search URL, assembled as: url0 + url1 + url2 + iStartCount + url3 + query.
url0 = "http://www.riss.kr"
url1 = "/search/Search.do?isDetailSearch=N&searchGubun=true&viewYn=OP&queryText=&strQuery="
# "◈" appears to be the literal delimiter RISS uses inside exQuery —
# presumably intentional; confirm against a live search URL.
url2 = "&exQuery=pyear%3A2021◈&exQueryText=발행연도+%5B2021%5D%40%40pyear%3A2021◈&order=%2FDESC&onHanja=false&strSort=RANK&p_year1=&p_year2=&iStartCount="
start_count = 0  # page offset; overwritten by the pagination loop below
# Fixed: "&regnm=" had been mangled to "®nm=" ("&reg" decoded as the HTML
# entity for "®" by the blog platform) — restored the literal parameter.
url3 = "&orderBy=&mat_type=&mat_subtype=&fulltext_kind=&t_gubun=&learning_type=&ccl_code=&inside_outside=&fric_yn=&image_yn=&gubun=&kdc=&ttsUseYn=&l_sub_code=&fsearchMethod=search&sflag=1&isFDetailSearch=N&pageNumber=1&resultKeyword=%ED%8A%B9%ED%97%88&fsearchSort=&fsearchOrder=&limiterList=&limiterListText=&facetList=&facetListText=&fsearchDB=&icate=re_a_kor&colName=re_a_kor&pageScale=10&isTab=Y&regnm=&dorg_storage=&language=&language_code=&clickKeyword=&relationKeyword=&query="
# Crawl RISS search results for every query term and write one CSV row per
# paper detail page.  Outer loop: one query; middle loop: result pages of
# 10 hits; inner loop: individual paper detail links.
for query in query_list:
    #os.system("cls")
    # First request is used only to read the total hit count for this query.
    url = url0 + url1 + url2 + str(start_count) + url3 + query
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "html.parser")
    dd_num = soup.find("span", {'class':'num'})
    print(query)
    try:
        # Total result count as text; int(num) below assumes plain digits
        # (no thousands separators) — TODO confirm against the live site.
        num = dd_num.text.strip()
    except:
        # NOTE(review): bare except — a missing count element (or any other
        # error) is treated as "no results" and the query is skipped.
        continue
    # Page through the results; iStartCount advances in steps of 10.
    for start_count in range(0, int(num), 10):
        print(f"start_count: {start_count} of {num}")
        url = url0 + url1 + url2 + str(start_count) + url3 + query
        html = requests.get(url)
        soup = BeautifulSoup(html.text, "html.parser")
        divs = soup.find_all("div", {"class": "srchResultListW"})
        for div in divs:
            p_titles = div.find_all("p", {"class": "title"})
            if not p_titles:
                continue
            for p in p_titles:
                count += 1
                content_url = p.find("a")["href"]
                # Skip javascript: pseudo-links that are not real detail pages.
                if "javascript" in content_url:
                    continue
                # Detail pages are fetched with Selenium — presumably because
                # parts of the page are rendered by JavaScript; confirm.
                url = url0 + content_url
                driver.get(url)
                html = driver.page_source
                soup = BeautifulSoup(html, "html.parser")
                title = soup.find("h3", "title")
                # title becomes a list of lines; only title[0] is used below.
                title = title.text.strip().replace("\t", "").split("\n")
                print("= = " * 20)
                # Fixed <li> order assumed on the detail page:
                # 0=authors, 1=publisher, 2=journal, 3=volume/issue, 4=year,
                # 6=keywords — TODO confirm against the site's layout.
                lis = soup.find("div", {"class": "infoDetail"}).find_all("li")
                authors = eliminate_space(lis[0].find("p").text.strip()).split(";")
                publisher = eliminate_space(lis[1].find("p").text.strip())
                paper = eliminate_space(lis[2].find("p").text.strip())
                volume = eliminate_space(lis[3].find("p").text.strip())
                year = eliminate_space(lis[4].find("p").text.strip())
                kword = eliminate_space(lis[6].find("p").text.strip()).replace(";", ",")
                # Drop affiliation suffixes like "Name(University)"; an entry
                # that holds only the tail of a parenthesis is blanked out.
                for author_idx in range(len(authors)):
                    if "(" in authors[author_idx]:
                        idx = authors[author_idx].index("(")
                        authors[author_idx] = authors[author_idx][:idx]
                    if ")" in authors[author_idx]:
                        authors[author_idx] = " "
                # Pad the author list to exactly 4 slots (the CSV has four
                # author columns); lists of 5+ are left as-is — skipped below.
                while True:
                    if len(authors) == 4:
                        break
                    elif len(authors) < 4:
                        authors.append(" ")
                    elif len(authors) >= 5:
                        break
                # Compact blanks: shift a non-blank author left into an
                # adjacent blank slot so the 4 columns fill from the front.
                for i in range(1, len(authors) - 1):
                    if authors[i] == " " and authors[i + 1] != " ":
                        authors[i] = authors[i + 1]
                        authors[i + 1] = " "
                # Papers with 5 or more authors are dropped entirely.
                if len(authors) >= 5:
                    continue
                div = soup.find("div", {"class": "innerCont"})
                #print(div)
                try:
                    # Prefer abstract pane "abs1"; if it starts with an ASCII
                    # letter (presumably an English abstract), fall back to
                    # "abs2", assumed to hold the Korean abstract — confirm.
                    abstract = div.find("div",{"class":"textWrap","id":"abs1"}).text.strip()
                    if abstract[0].encode().isalpha():
                        abstract = div.find("div",{"class":"textWrap","id":"abs2"}).text.strip()
                except:
                    # NOTE(review): bare except — any missing pane (or other
                    # error) yields an empty abstract instead of a crash.
                    abstract = ""
                print(f"[{count}]",query,authors,title[0], publisher, paper, volume, year)
                # Column order must match the header row written at startup.
                csv_writer.writerow(
                    [
                        " ",
                        query,
                        title[0],
                        authors[0],
                        authors[1],
                        authors[2],
                        authors[3],
                        publisher,
                        year,
                        paper,
                        volume,
                        kword,
                        abstract
                    ]
                )
# Flush the CSV file to disk and report the total runtime in minutes.
f.close()
minutes_taken = (time.time() - start) / 60
print(f"{minutes_taken:.2f} min..")