Saturday, April 6, 2024

Legal news scraping

The code snippet below scrapes notice data from the Legal News website.

Some steps require manual interaction, so the snippet is split into notebook cells. A # ******** line marks the beginning and end of each cell.

import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

url = 'https://www.legalnews.com/Home/Login'
pw = ''
un = ''

path = r"chromedriver.exe"

service = Service(executable_path=path)
driver = webdriver.Chrome(service=service)

driver.get(url)
time.sleep(2)

# Login...
driver.find_element(By.XPATH, '//*[@id="email"]').send_keys(un)
driver.find_element(By.XPATH, '//*[@id="password"]').send_keys(pw)
driver.find_element(By.XPATH, '//*[@id="btnlogin"]').click()

time.sleep(3)
driver.find_element(By.XPATH, '//*[@id="top-right"]/div/a[1]').click()
# **********************************************
# -------------------
# Manually switch the results view to the table layout, then run the next cell
# -------------------
# **********************************************
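A side note: the fixed time.sleep() pauses work, but they can be flaky on a slow connection. Selenium's explicit waits are a sturdier drop-in if you need them; a minimal sketch, reusing the same element IDs as above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the login form instead of sleeping blindly...
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.ID, 'email'))).send_keys(un)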

dfList = []
# **********************************************


html_data = driver.page_source
soup = BeautifulSoup(html_data, 'html.parser')

# Read all tables on the page into a list of dataframes...
tb = pd.read_html(html_data)

# Extract the URL of each row...
prod_title = soup.find_all('tr')  # notice rows carry a 'data-href' attribute

noticeURL_list = []
for t in prod_title:
    try:
        noticeURL_list.append(f"https://www.legalnews.com{t['data-href']}")
    except KeyError:
        # Header rows and rows without a 'data-href' attribute land here...
        pass
    
tb[1]['URL'] = noticeURL_list

# Make df a familiar dataframe... :)
df = tb[1]

dfList.append(df)
# **********************************************
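One thing to watch: assigning noticeURL_list as a new column only works when the list length matches the row count of tb[1]. It lines up on this site, but if you ever hit a length-mismatch error, selecting only the rows that actually carry the attribute is a safer way to collect the links. A small sketch of the same extraction:

# Only <tr> elements that actually have a 'data-href' attribute...
rows = soup.select('tr[data-href]')
noticeURL_list = [f"https://www.legalnews.com{r['data-href']}" for r in rows]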


# Click the 'Next Page' btn...  *** CLICK ON PAGE 2 MANUALLY FIRST TO AVOID AN ERROR ***
# The loop re-clicks pager anchor a[2] on each pass; i is only used for the printout...
i = 2
p = 2
for x in range(10):
    print(f'Clicking on page... {i}')
    
    driver.find_element(By.XPATH, f'//*[@id="divListView"]/div[1]/div[1]/a[{p}]').click()
    time.sleep(2)

    html_data = driver.page_source
    soup = BeautifulSoup(html_data, 'html.parser')

    # Read all tables on the page into a list of dataframes...
    tb = pd.read_html(html_data)

    # Extract the URL of each row...
    prod_title = soup.find_all('tr')  # notice rows carry a 'data-href' attribute

    noticeURL_list = []
    for t in prod_title:
        try:
            noticeURL_list.append(f"https://www.legalnews.com{t['data-href']}")
        except KeyError:
            pass

    tb[1]['URL'] = noticeURL_list

    # Make df a familiar dataframe... :)
    df = tb[1]
    dfList.append(df)
    i = i+1
    
print('Done...')

# **********************************************
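The range(10) above assumes you already know roughly how many pages there are. If you would rather let the loop stop on its own, one option is to catch Selenium's NoSuchElementException when the pager anchor runs out. A rough sketch, assuming the 'next' anchor disappears on the last page:

from selenium.common.exceptions import NoSuchElementException

while True:
    try:
        driver.find_element(By.XPATH, '//*[@id="divListView"]/div[1]/div[1]/a[2]').click()
    except NoSuchElementException:
        # No pager anchor left, so we have run out of pages...
        break
    time.sleep(2)
    # ...same table and URL extraction as in the cell above...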

df2 = pd.concat(dfList).drop_duplicates()
df2.to_excel('LegalNews__April2024_Table1.xlsx', index=False)
df2
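One last note: newer pandas releases (2.1 and up) raise a FutureWarning when you pass a literal HTML string to read_html, since they expect a file-like object instead. If you see that warning, wrapping the page source in StringIO keeps the calls above working unchanged:

from io import StringIO

tb = pd.read_html(StringIO(html_data))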


That is it!
