Scraping Duckduckgo With Python 3.6
Solution 1:
If I search for Load More
in the source code of the result I can't find it. Did you try using the non-javascript version?
You can use it by simply adding html
to the URL path:
https://duckduckgo.com/html?q=paralegal&t=h_&ia=web
There you can find the "Next"
button at the end of the results page.
This one works for me (Chrome version):
# Load the JavaScript-free HTML version of the results page; it exposes a
# plain "Next" submit button instead of the script-driven "Load More" control.
# NOTE(review): `browser` is assumed to be an already-created Selenium
# WebDriver instance defined earlier — it is not visible in this snippet.
results_url = "https://duckduckgo.com/html?q=paralegal&t=h_&ia=web"
browser.get(results_url)

# Print the text of every element whose id is "links", then how many matched.
results = browser.find_elements_by_id('links')
num_page_items = len(results)
for result in results:
    print(result.text)
print(num_page_items)

# The singular find_element_by_xpath raises when nothing matches, so a
# following `if` can never see a "missing" button. The plural form returns
# an empty list instead, which makes the last page a no-op rather than a crash.
# Matching on value="Next" (not the shared button class) avoids hitting the
# "Previous" button on later pages.
next_buttons = browser.find_elements_by_xpath('//input[@value="Next"]')
if next_buttons:
    nxt_page = next_buttons[0]
    # Scroll the button into view before clicking it.
    browser.execute_script('arguments[0].scrollIntoView();', nxt_page)
    nxt_page.click()
Btw.: Duckduckgo also provides a nice api, which is probably much easier to use ;)
Edit: fixed the selector for the next-page button, which selected the "Previous"
button on the second result page (thanks to @kingbode).
Solution 2:
Selecting the button by the class 'btn--alt' does not work once you reach the second page, because both the 'Next' and 'Previous' buttons share that class name; the selector matched the 'Previous' button and sent me back to the first page.
The code change below worked perfectly for me:
# Locate the pagination button by its value attribute rather than by CSS
# class, so that only the "Next" button can match, then click it.
# NOTE(review): `driver` is assumed to be an existing Selenium WebDriver
# instance — it is not defined in this snippet.
next_button_xpath = '//input[@value="Next"]'
nextButton = driver.find_element_by_xpath(next_button_xpath)
nextButton.click()
full function:
def _collectSearchResults(pageSource, searchResults, filterTheSearch, searchFilter):
    """Add the result links found in one page of HTML to ``searchResults``.

    Looks at every ``<h2>`` tag and every ``<div>`` whose class is
    ``result__body links_main links_deep``, takes the first ``<a>`` inside
    each, and stores ``href -> link text`` in ``searchResults`` (mutated in
    place). URLs already present are left untouched.
    """
    soup = BeautifulSoup(pageSource, "html.parser")
    resultTags = soup.findAll('h2') + soup.findAll(
        'div', {'class': 'result__body links_main links_deep'})
    for tag in resultTags:
        anchor = tag.find('a')
        resultURL = anchor.get('href') if anchor is not None else None
        if resultURL is None:
            # No usable link in this tag: skip it instead of falling through
            # with a stale (or unbound) URL from a previous iteration.
            print('nothing to parse')
            continue
        if resultURL in searchResults:
            continue
        # An empty/None filter keeps everything, matching how '' behaves
        # with the `in` operator.
        if filterTheSearch and searchFilter and searchFilter not in resultURL:
            continue
        searchResults[resultURL] = anchor.text


def duckduckGoSearch(query, searchPages=None, filterTheSearch=False, searchFilter=None):
    """Search DuckDuckGo's HTML front end with Chrome and collect result links.

    NOTE(review): relies on ``webdriver``, ``Keys``, ``time`` and
    ``BeautifulSoup`` being imported at file level — those imports are not
    visible in this section.

    Args:
        query: Text typed into the search box.
        searchPages: Maximum number of result pages to scrape, starting with
            the first one. ``None`` means keep following "Next" until no
            further page can be opened.
        filterTheSearch: When true, keep only URLs containing ``searchFilter``.
        searchFilter: Substring a URL must contain when filtering is enabled;
            an empty or ``None`` filter keeps every URL.

    Returns:
        dict mapping each result URL to the text of its link.
    """
    # Local import: the base class of Selenium's own errors, used to detect
    # "no Next button / cannot click it" without a bare ``except``.
    from selenium.common.exceptions import WebDriverException

    URL_ = 'https://duckduckgo.com/html?'
    searchResults = {}
    driver = webdriver.Chrome()
    try:
        driver.get(URL_)

        # Type the query into the search box and submit it with RETURN.
        searchBox = driver.find_element_by_xpath('//*[@id="search_form_input_homepage"]')
        searchBox.send_keys(query)
        searchBox.send_keys(Keys.RETURN)
        time.sleep(2)  # crude wait for the first results page

        page_number = 1
        while searchPages is None or page_number <= searchPages:
            # Parse the page currently shown BEFORE moving on, so the first
            # results page is included.
            _collectSearchResults(driver.page_source, searchResults,
                                  filterTheSearch, searchFilter)
            if searchPages is not None and page_number >= searchPages:
                break
            try:
                # Match on value="Next": the CSS class is shared with the
                # "Previous" button.
                driver.find_element_by_xpath('//input[@value="Next"]').click()
            except WebDriverException:
                print('no more pages')
                break
            page_number += 1
    finally:
        # Always release the browser, including when an error propagates.
        driver.quit()

    print('search is done , found ', len(searchResults), 'Results')
    return searchResults
Post a Comment for "Scraping Duckduckgo With Python 3.6"