The code snippet below scrapes data from the Legal News website (legalnews.com).
Some steps require manual interaction, so the snippet is split into notebook cells. A line of # ******** marks the beginning and end of each cell.
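The cells rely on the usual Selenium, BeautifulSoup, and pandas imports; a minimal setup would look like this:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By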
url = 'https://www.legalnews.com/Home/Login'
pw = ''  # account password (fill in before running)
un = ''  # account username / email (fill in before running)
path = r"chromedriver.exe"  # path to the ChromeDriver executable
service = Service(executable_path=path)
driver = webdriver.Chrome(service=service)
driver.get(url)
time.sleep(2)
# Login...
driver.find_element(By.XPATH, '//*[@id="email"]').send_keys(un)
driver.find_element(By.XPATH, '//*[@id="password"]').send_keys(pw)
driver.find_element(By.XPATH, '//*[@id="btnlogin"]').click()
time.sleep(3)
driver.find_element(By.XPATH, '//*[@id="top-right"]/div/a[1]').click()
# **********************************************
# -------------------
# Manually switch the listing to the table view, then run the next cell
# -------------------
# **********************************************
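After the manual switch, the next cells parse driver.page_source. If the fixed time.sleep calls prove flaky, one option is an explicit wait; the sketch below assumes the listing is rendered inside the divListView container that the paging XPath further down also targets.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 15 seconds for the listing container to be present before parsing the page
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.ID, 'divListView'))
)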
dfList = []
# **********************************************
html_data = driver.page_source
soup = BeautifulSoup(html_data, 'html.parser')
# Read table to df...
tb = pd.read_html(html_data)
# Extract URL of rows...
prod_title = soup.find_all('tr') # ['data-href']
noticeURL_list = []
for t in prod_title:
    try:
        noticeURL_list.append(f"https://www.legalnews.com{t['data-href']}")
    except Exception:
        pass
tb[1]['URL'] = noticeURL_list
# Make df a familiar dataframe... :)
df = tb[1]
dfList.append(df)
# **********************************************
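A side note on pd.read_html: recent pandas versions warn when it is passed a literal HTML string, so you may prefer to wrap the page source in StringIO. This is an optional tweak, not part of the original cells:
from io import StringIO
# Equivalent to pd.read_html(html_data), but avoids the literal-HTML deprecation warning in newer pandas
tb = pd.read_html(StringIO(html_data))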
# Click 'Next Page' btn... *** CLICK ON PAGE 2 MANUALLY TO AVOID ERROR ***
i = 2
p = 2  # index of the pager link clicked on each pass; i is only used for logging
for x in range(10):
    print(f'Clicking on page... {i}')
    driver.find_element(By.XPATH, f'//*[@id="divListView"]/div[1]/div[1]/a[{p}]').click()
    time.sleep(2)
    html_data = driver.page_source
    soup = BeautifulSoup(html_data, 'html.parser')
    # Read table to df...
    tb = pd.read_html(html_data)
    # Extract URL of rows...
    prod_title = soup.find_all('tr')  # ['data-href']
    noticeURL_list = []
    for t in prod_title:
        try:
            noticeURL_list.append(f"https://www.legalnews.com{t['data-href']}")
        except Exception:
            pass
    tb[1]['URL'] = noticeURL_list
    # Make df a familiar dataframe... :)
    df = tb[1]
    dfList.append(df)
    i = i + 1
print('Done...')
# **********************************************
df2 = pd.concat(dfList).drop_duplicates()
df2.to_excel('LegalNews__April2024_Table1.xlsx', index=False)
df2
That is it!