Given a list of company names, search Google to retrieve their email addresses:
import re
import pandas as pd
import numpy as np
import requests, lxml.html
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import quote_plus

# Script: for every entry below, run a "<entry> email" Google search through
# the scraping proxy and collect every e-mail address found in the result
# page text. Results accumulate in `unique_emails_list` as (name, emails)
# tuples.
#
# Despite the variable name, the entries are used as free-text search terms
# (company names or URLs); they are never fetched directly.
list_of_url = ['http://umaryusuf.com', 'another website']

# REGEX to search for emails...
EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

# Both patterns are compiled once, outside the loop.
# EMAIL_REGEX only lists lowercase letters, so it must be applied
# case-insensitively: otherwise "John@x.com" produces the bogus partial
# match "ohn@x.com".
STRICT_EMAIL_RE = re.compile(EMAIL_REGEX, re.IGNORECASE)
# Simpler fallback pattern. The TLD may be up to 63 characters (the DNS label
# limit) so addresses on long TLDs such as ".company" are not truncated.
LOOSE_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,63}")

# Seconds to wait for the proxy before giving up on one search. Without a
# timeout a stalled connection would hang the whole batch.
REQUEST_TIMEOUT = 70

unique_emails_list = []
for name in list_of_url:
    search_query = name + " email"
    print('Processing...', name)

    # -------------- FOR BULK GOOGLE SEARCH USE A PROXY -----------------
    # NOTE: replace the placeholder API key with a real one before running.
    # The query is URL-encoded so spaces, '&', '#', etc. in a name stay part
    # of the `q` parameter instead of corrupting the Google URL.
    params = (
        ('api_key', 'XXXXXXXXXXXXXXXXXXXXXXXXXXX'),
        ('url', 'https://www.google.com/search?q=' + quote_plus(search_query)),
    )
    try:
        response = requests.get(
            'http://api.scraperapi.com/',
            params=params,
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        # One failed search must not abort the rest of the batch.
        print('Request failed for', name, '-', exc)
        data = name, 'Error Occured'
        unique_emails_list.append(data)
        print(data)
        continue
    # -------------------------------------------------------------------

    print(response.status_code)

    # Work on the visible text only, so markup does not pollute matches.
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()

    emails_1 = [re_match.group() for re_match in STRICT_EMAIL_RE.finditer(text)]
    emails_2 = LOOSE_EMAIL_RE.findall(text)

    # De-duplicate across both patterns; sorted for reproducible output.
    unique_emails = sorted(set(emails_1 + emails_2))

    data = name, unique_emails
    unique_emails_list.append(data)
    print(data)
Given a list of company domain names, fetch each domain's web page and extract all email addresses found on that page:
import re
import pandas as pd
import numpy as np
import requests, lxml.html
from bs4 import BeautifulSoup
import urllib.request
# Script: download each page listed below and collect every e-mail address
# that appears in its raw HTML. Results accumulate in `site_list` as
# (domain, emails) tuples; a page that cannot be fetched is recorded as
# (domain, 'Error Occured').
list_of_url = ['http://umaryusuf.com']

# Compiled once, outside the loop. The TLD may be up to 63 characters (the
# DNS label limit) so addresses on long TLDs such as ".company" are not
# truncated.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,63}")

# Seconds to wait for a site before giving up; without it a stalled server
# would hang the whole run.
FETCH_TIMEOUT = 30

site_list = []
for domain in list_of_url:
    print('Processing...', domain)
    try:
        # `with` closes the HTTP response even if read/decode fails.
        with urllib.request.urlopen(domain, timeout=FETCH_TIMEOUT) as f:
            # ISO-8859-1 maps every byte to a character, so decoding cannot
            # fail regardless of the page's real encoding.
            s = f.read().decode('ISO-8859-1')
    except Exception as exc:
        # Best-effort batch: report why this site failed and keep going.
        print('Could not fetch', domain, '-', exc)
        d = domain, 'Error Occured'
    else:
        # De-duplicate; sorted for reproducible output.
        newemails = sorted(set(EMAIL_RE.findall(s)))
        d = domain, newemails
    site_list.append(d)
    print(d)

print("Finished...")
Enjoy!
No comments:
Post a Comment