Android – Parse JS generated urls with JSOUP

(See UPDATE below; the first/accepted solution didn’t meet the Android requirement, but is left for reference.)


Desktop Solution

HtmlUnit doesn’t seem to be able to handle this site (as is often the case lately), so I don’t have a plain Java solution either. You could use PhantomJS instead: download the binary for your OS, create a script file, start the process from within your Java code, and parse the output with a DOM parser like jsoup.

Script file (here called simple.js):

/*
 * PhantomJS script: loads a page, lets its JavaScript run, and writes the
 * resulting page content to "<fileName>.html".
 *
 * Usage: phantomjs simple.js <url> [fileName]
 *   url      - page to load (required)
 *   fileName - output file name without the ".html" extension (default: "output")
 *
 * Exit code: 0 when the page was saved, 1 when no URL was given or the page
 * could not be loaded.
 */
var page = require('webpage').create();
var fs = require('fs');
var system = require('system');

var url = "";
var fileName = "output";
// first parameter: url
// second parameter: filename for output
console.log("args length: " + system.args.length);

if (system.args.length > 1) {
    url = system.args[1];
}
if (system.args.length > 2) {
    fileName = system.args[2];
}

if (url === "") {
    console.log("Usage: phantomjs simple.js <url> [fileName]");
    phantom.exit(1);
} else {
    // The load lives in the else branch because phantom.exit() may not stop
    // the script immediately; without it page.open() would still be called
    // with an empty URL.
    page.settings.userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36";
    // Only the markup is needed, so skip downloading images.
    page.settings.loadImages = false;

    page.open(url, function(status) {
        console.log("Status: " + status);
        var saved = false;
        if (status === "success") {
            var path = fileName + '.html';
            fs.write(path, page.content, 'w');
            saved = true;
        }
        // Report failure through the exit code so the caller can detect it.
        phantom.exit(saved ? 0 : 1);
    });
}

Java code (example to get title and cover-url):

// Renders the page with PhantomJS (through simple.js), then uses jsoup to print
// the title and cover-image URL of every entry in the rendered list.
try {
    //change path to phantomjs binary and your script file
    String outputFileName = "srulad";
    String phantomJSPath = "phantomjs" + File.separator + "bin" + File.separator + "phantomjs";
    String scriptFile = "simple.js";

    String urlParameter = "http://srulad.com/#page-2";

    // Remove a stale result so a failed run cannot be mistaken for a fresh one.
    File outputFile = new File(outputFileName + ".html");
    outputFile.delete();

    // Pass every argument separately: a single command string would be split on
    // whitespace, which breaks as soon as a path or URL contains a space.
    ProcessBuilder builder = new ProcessBuilder(phantomJSPath, scriptFile, urlParameter, outputFileName);
    builder.redirectErrorStream(true);
    Process process = builder.start();

    // Drain the child's output; otherwise it can block on a full pipe and
    // waitFor() would never return.
    byte[] discarded = new byte[4096];
    while (process.getInputStream().read(discarded) != -1) {
        // output is intentionally ignored
    }
    int exitCode = process.waitFor();

    if (exitCode != 0 || !outputFile.isFile()) {
        System.err.println("PhantomJS did not produce " + outputFile + " (exit code " + exitCode + ")");
    } else {
        // <outputFileName>.html is written by simple.js
        Document doc = Jsoup.parse(outputFile, "UTF-8");
        Elements elements = doc.select("#list_page-2 > div");

        for (Element element : elements) {
            Element titleLink = element.select("div.l-description.float-left > div:nth-child(1) > a").first();
            Element coverImage = element.select("div.l-image.float-left > a > img.lazy").first();
            // first() returns null when nothing matches; skip entries that lack
            // either piece instead of failing with a NullPointerException.
            if (titleLink == null || coverImage == null) {
                continue;
            }
            System.out.println(titleLink.attr("title"));
            System.out.println(coverImage.attr("data-original"));
        }
    }
} catch (IOException e) {
    e.printStackTrace();
} catch (InterruptedException e) {
    // Restore the interrupt flag so callers further up can react to it.
    Thread.currentThread().interrupt();
    e.printStackTrace();
}

Output:

სიყვარული და მოწყალება / Love & Mercy
http://srulad.com/assets/uploads/42410_Love_and_Mercy.jpg
მუზა / The Muse
http://srulad.com/assets/uploads/43164_large_qRzsimNz0eDyFLFJcbVLIxlqii.jpg
...

UPDATE

Parsing websites with JavaScript-based dynamic content is possible on Android using WebView and jsoup.
The following example app uses a JavaScript-enabled WebView to render a JavaScript-dependent website. The HTML source is returned through a JavascriptInterface and parsed with jsoup; as a proof of concept, the titles and the URLs of the cover images are used to populate a ListView. The buttons decrement or increment the page number, triggering an update of the ListView. Note: tested on an Android 5.1.1/API 22 device.

Add the internet permission to your AndroidManifest.xml:

<!-- Lets the app open network connections; the WebView in MainActivity loads a remote page. -->
<uses-permission android:name="android.permission.INTERNET" />

activity_main.xml

<?xml version="1.0" encoding="utf-8"?>
<!-- Main screen: a row holding the two paging buttons, with the result list below it. -->
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
    android:layout_width="match_parent"
    android:layout_height="match_parent"
    android:orientation="vertical">

    <!-- Paging controls; both buttons carry the same layout weight. -->
    <LinearLayout
        android:layout_width="match_parent"
        android:layout_height="wrap_content"
        android:orientation="horizontal">

        <!-- Goes back one page. -->
        <Button
            android:id="@+id/buttonDown"
            android:layout_width="wrap_content"
            android:layout_height="wrap_content"
            android:layout_weight="0.5"
            android:text="@string/page_down" />

        <!-- Advances one page. -->
        <Button
            android:id="@+id/buttonUp"
            android:layout_width="wrap_content"
            android:layout_height="wrap_content"
            android:layout_weight="0.5"
            android:text="@string/page_up" />
    </LinearLayout>

    <!-- Zero height plus a weight: the list receives the vertical space the button row leaves over. -->
    <ListView
        android:id="@+id/listView"
        android:layout_width="match_parent"
        android:layout_height="0dp"
        android:layout_gravity="bottom"
        android:layout_weight="0.5" />
</LinearLayout>

MainActivity.java

public class MainActivity extends AppCompatActivity {

    private final Handler uiHandler = new Handler();
    private ArrayAdapter<String> adapter;
    private ArrayList<String> entries = new ArrayList<>();
    private ProgressDialog progressDialog;

    private class JSHtmlInterface {
        @android.webkit.JavascriptInterface
        public void showHTML(String html) {
            final String htmlContent = html;

            uiHandler.post(
                new Runnable() {
                    @Override
                    public void run() {
                        Document doc = Jsoup.parse(htmlContent);
                        Elements elements = doc.select("#online_movies > div > div");
                        entries.clear();
                        for (Element element : elements) {
                            String title = element.select("div.l-description.float-left > div:nth-child(1) > a").first().attr("title");
                            String imgUrl = element.select("div.l-image.float-left > a > img.lazy").first().attr("data-original");
                            entries.add(title + "\n" + imgUrl);
                        }
                        adapter.notifyDataSetChanged();
                    }
                }
            );
        }
    }


    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        ListView listView = (ListView) findViewById(R.id.listView);
        adapter = new ArrayAdapter<>(this, android.R.layout.simple_list_item_1, android.R.id.text1, entries);
        listView.setAdapter(adapter);

        progressDialog = ProgressDialog.show(this, "Loading","Please wait...", true);
        progressDialog.setCancelable(false);

        try {
            final WebView browser = new WebView(this);
            browser.setVisibility(View.INVISIBLE);
            browser.setLayerType(View.LAYER_TYPE_NONE,null);
            browser.getSettings().setJavaScriptEnabled(true);
            browser.getSettings().setBlockNetworkImage(true);
            browser.getSettings().setDomStorageEnabled(false);
            browser.getSettings().setCacheMode(WebSettings.LOAD_NO_CACHE);
            browser.getSettings().setLoadsImagesAutomatically(false);
            browser.getSettings().setGeolocationEnabled(false);
            browser.getSettings().setSupportZoom(false);

            browser.addJavascriptInterface(new JSHtmlInterface(), "JSBridge");

            browser.setWebViewClient(
                new WebViewClient() {

                    @Override
                    public void onPageStarted(WebView view, String url, Bitmap favicon) {
                        progressDialog.show();
                        super.onPageStarted(view, url, favicon);
                    }

                    @Override
                    public void onPageFinished(WebView view, String url) {
                        browser.loadUrl("javascript:window.JSBridge.showHTML('<html>'+document.getElementsByTagName('html')[0].innerHTML+'</html>');");
                        progressDialog.dismiss();
                    }
                }
            );

            findViewById(R.id.buttonDown).setOnClickListener(new View.OnClickListener() {
                @Override
                public void onClick(View view) {
                    uiHandler.post(new Runnable() {
                        @Override
                        public void run() {
                            int page = Integer.parseInt(browser.getUrl().split("-")[1]);
                            int newPage = page > 1 ? page-1 : 1;
                            browser.loadUrl("http://srulad.com/#page-" + newPage);
                            browser.loadUrl(browser.getUrl()); // not sure why this is needed, but doesn't update without it on my device
                            if(getSupportActionBar()!=null) getSupportActionBar().setTitle(browser.getUrl());
                        }
                    });
                }
            });

            findViewById(R.id.buttonUp).setOnClickListener(new View.OnClickListener() {
                @Override
                public void onClick(View view) {
                    uiHandler.post(new Runnable() {
                        @Override
                        public void run() {
                            int page = Integer.parseInt(browser.getUrl().split("-")[1]);
                            int newPage = page+1;
                            browser.loadUrl("http://srulad.com/#page-" + newPage);
                            browser.loadUrl(browser.getUrl()); // not sure why this is needed, but doesn't update without it on my device
                            if(getSupportActionBar()!=null) getSupportActionBar().setTitle(browser.getUrl());
                        }
                    });
                }
            });

            browser.loadUrl("http://srulad.com/#page-1");
            if(getSupportActionBar()!=null) getSupportActionBar().setTitle(browser.getUrl());

        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Leave a Comment

tech