Tuesday, February 16, 2021

Get Emails from Google Search Given Company Name/Domain

Given a list of company names, search Google for each name plus the word "email" and pull any email addresses that appear in the result page:

import re
import pandas as pd

import requests
from bs4 import BeautifulSoup


# Company names or site URLs to search for
list_of_url = ['http://umaryusuf.com', 'another website']

# REGEX to search for emails...
EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

unique_emails_list = []

for name in list_of_url:    
    search_query = name + " email"
    print('Processing...', name)

    # -------------- FOR BULK GOOGLE SEARCH USE A PROXY -----------------
    params = (
            ('api_key', 'XXXXXXXXXXXXXXXXXXXXXXXXXXX'),
            ('url', 'https://www.google.com/search?q='+search_query),
        )
    response = requests.get('http://api.scraperapi.com/', params=params)
    # -------------------------------------------------------------------


    print(response.status_code)

    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()

    emails_1 = [re_match.group() for re_match in re.finditer(EMAIL_REGEX, text)]

    emails_2 = re.findall(r"[A-Za-z0-9._%+-]+"
                         r"@[A-Za-z0-9.-]+"
                         r"\.[A-Za-z]{2,4}", text)

    unique_emails = list(set(emails_1 + emails_2))
    data = name, unique_emails

    unique_emails_list.append(data)
    print(data)
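
Since pandas is already imported, the collected (company, emails) pairs can be written out to a spreadsheet afterwards. A minimal sketch, assuming an output file named company_emails.csv (not part of the original script):

# Save the scraped results to CSV (the filename is an assumption)
df = pd.DataFrame(unique_emails_list, columns=['company', 'emails'])
df.to_csv('company_emails.csv', index=False)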


Given a list of company domain names, fetch each domain's home page and extract all email addresses found in the page source:

import re
import urllib.request

import pandas as pd


# Company domains whose home pages will be scanned for emails
list_of_url = ['http://umaryusuf.com']

site_list = []

for domain in list_of_url:

    print('Processing...', domain)
    
    try:
        f = urllib.request.urlopen(domain)
        s = f.read().decode('ISO-8859-1')
        emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}", s)
        newemails = list(set(emails))
        d = domain, newemails

        site_list.append(d)
        print(d)
    except Exception:
        d = domain, 'Error Occurred'
        site_list.append(d)
        print(d)

print("Finished...")


Enjoy!
