Sample HtmlUnit Automation for Finding ASIN on Amazon.com

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.gargoylesoftware.htmlunit.*;  
import com.gargoylesoftware.htmlunit.html.*;
public class AsinFinder {
 /**
  * @param args
  */
 static final WebClient browser;
    static {
        browser = new WebClient(BrowserVersion.INTERNET_EXPLORER_8);
        browser.setJavaScriptEnabled(false);
        browser.setRedirectEnabled(true);
        browser.setTimeout(60000);
    }
    
 @SuppressWarnings("unchecked")
 public static void main(String[] args) {
  // TODO Auto-generated method stub
  String textToSearchInSearchBar = "iphone";
  String textToSearchInDetailPage = "Some Text";
  String webSiteUrl = "http://www.amazon.com";
  String userName = "xxxxx";
  String passWord = "xxxxx";
  String folderForDumpingPageSnapshots = "C:/Users/saikatd/ToDelete";
  String enableOneClickButtonXPathBySrc = "(//input[contains(@src, 'enable-one-click')])";
  String goButtonXPathByValue = "(//input[contains(@value, 'Go')])";
  String goButtonXPathByAlt = "(//input[contains(@alt, 'Go')])";
  String searchTextBoxId = "twotabsearchtextbox";
  String nextPageLinkId = "pagnNextLink";
  
  HtmlPage currentPage = null;
        try {
         deleteRecursively(new File(folderForDumpingPageSnapshots));
         
         //Navigate to Sign-In Page
         currentPage = (HtmlPage) browser.getPage(webSiteUrl + "/gp/flex/sign-out.html/ref=pd_irl_gw_r/?%5Fencoding=UTF8&path=%2Fgp%2Fyourstore&signIn=1&useRedirectOnSuccess=1&action=sign-out");
         currentPage.save(new File(folderForDumpingPageSnapshots + "/loginPage.html"));
         
         //Enter credentials
         HtmlForm signInForm = (HtmlForm) currentPage.getForms().get(0);
         HtmlTextInput emailField = (HtmlTextInput) signInForm.getInputByName("email"); 
         emailField.setValueAttribute(userName);
         HtmlPasswordInput passwordField = (HtmlPasswordInput) signInForm.getInputByName("password"); 
         passwordField.setValueAttribute(passWord);
         
         //Click on Sign-In button
         HtmlImageInput button = (HtmlImageInput) signInForm.getInputsByValue("Continue").get(0);
         currentPage = (HtmlPage)button.click(); 
         currentPage.save(new File(folderForDumpingPageSnapshots + "/afterSignIn.html"));
         
         //Navigate to Manage Address Page
         currentPage = (HtmlPage) browser.getPage(webSiteUrl + "/gp/css/account/address/view.html");
         currentPage.save(new File(folderForDumpingPageSnapshots + "/manageAddress.html"));
         //Check if 1-Click is enabled or not
         List<HtmlElement> elements = (List<HtmlElement>) currentPage.getByXPath(enableOneClickButtonXPathBySrc);
          if(elements.size() != 0) {
           HtmlElement element = elements.get(0);
                    currentPage = element.click();
                }
         currentPage.save(new File(folderForDumpingPageSnapshots + "/afterTurningOnOneClick.html"));
         
         //Type iPhone in the search bar
         HtmlTextInput searchTextBox = (HtmlTextInput) currentPage.getElementById(searchTextBoxId);
         searchTextBox.setValueAttribute(textToSearchInSearchBar);
         
         //Click on the Go button
         //New navigation bar
         elements = (List<HtmlElement>) currentPage.getByXPath(goButtonXPathByValue);
      if(elements.size() != 0) {
       HtmlElement element = elements.get(0);
                currentPage = element.click();
            }
      
      //Old navigation bar
      else {
       elements = (List<HtmlElement>) currentPage.getByXPath(goButtonXPathByAlt);
          if(elements.size() != 0) {
           HtmlElement element = elements.get(0);
                    currentPage = element.click();
                }
      }
      currentPage.save(new File(folderForDumpingPageSnapshots + "/afterSearching.html"));
      
      int asinCount = 1;
      int pageCount = 1;
      //Extract ASINs from the page source 
      do {
       if(pageCount != 1) {
        HtmlAnchor nextLink = (HtmlAnchor) currentPage.getElementById(nextPageLinkId);
           currentPage = nextLink.click();
           currentPage.save(new File(folderForDumpingPageSnapshots + "/SearchPage" + pageCount + ".html"));
           }
           ArrayList<String> asinList = new ArrayList<String>(); 
              Document document = Jsoup.parse(currentPage.getWebResponse().getContentAsString());
                 Elements links = document.select("a");
                 for (Element link : links) {
                  if(!link.absUrl("href").isEmpty() && link.absUrl("href").contains("/dp/")) {
                   int startIndex = link.absUrl("href").indexOf("/dp/");
                   startIndex = startIndex + "/dp/".length();
                   int endIndex = link.absUrl("href").indexOf("/",startIndex);
                   String ASIN = link.absUrl("href").substring(startIndex, endIndex);
                   if(!asinList.contains(ASIN)) {
                    asinList.add(ASIN);
                    }
                   }
                  }
                 //Iterate over the ASINs retrieved
                 for(String ASIN : asinList) {
                  System.out.println("Retrieving Page: " + webSiteUrl + "/dp/" + ASIN);
                  try {
                   HtmlPage detailPage = (HtmlPage) browser.getPage(webSiteUrl + "/dp/" + ASIN);
                         BufferedWriter bw = new BufferedWriter(new FileWriter(folderForDumpingPageSnapshots + "/DetailPage" + asinCount + ".html")); 
                         bw.write(detailPage.getWebResponse().getContentAsString()); 
                         bw.close();
                         if(detailPage.getWebResponse().getContentAsString().contains(textToSearchInDetailPage)) {
                          System.out.println("ASIN Found!!!" + ASIN);
                          asinCount = -1;//for exiting the outer while loop
                          break;
                         }
                         asinCount++;
                  } catch (Exception e) {
                            e.printStackTrace();
                        }
                 }
                 if(asinCount != -1) {
                  pageCount++;
                     System.out.println("Navigating to Page: " + pageCount);
                 }
       } while((HtmlAnchor) currentPage.getElementById(nextPageLinkId) != null && asinCount != -1);
      
      //Check if ASIN was found or not
      if(asinCount != -1) {
       System.out.println("No ASIN Found!!!");
      }
      
        } catch (Exception e) {
            e.printStackTrace();
        }
 }
 
 public static void deleteRecursively(File path) {
  File[] c = path.listFiles();         
  System.out.println("Cleaning out folder:" + path.toString());
  for (File file : c) {
   if (file.isDirectory()) {
    System.out.println("Deleting file:" + file.toString());
    deleteRecursively(file);
    file.delete();
    } else { 
    file.delete();
    }
   }          
  //path.delete();
  }
 }
Advertisements
This entry was posted in Information Technology. Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. ( Log Out / Change )

Twitter picture

You are commenting using your Twitter account. ( Log Out / Change )

Facebook photo

You are commenting using your Facebook account. ( Log Out / Change )

Google+ photo

You are commenting using your Google+ account. ( Log Out / Change )

Connecting to %s