From b8336c26fbdc24b3869bb0479f8bc1c299df1618 Mon Sep 17 00:00:00 2001
From: Nekojimi <Jim@nekojimi.moe>
Date: Thu, 30 Sep 2021 17:07:22 +0100
Subject: [PATCH] Made WebScraperSearcher able to parse different result
 formats (HTML/JSON) and supply query parameters dynamically.

---
 .../musicsearcher/parsers/HTMLParser.java     |  70 ++++++++
 .../musicsearcher/parsers/JSONParser.java     |  72 +++++++++
 .../musicsearcher/parsers/Parser.java         |  30 ++++
 .../musicsearcher/providers/Searcher.java     |  10 +-
 .../providers/WebScraperSearcher.java         | 151 ++++++++++--------
 5 files changed, 263 insertions(+), 70 deletions(-)
 create mode 100644 src/main/java/moe/nekojimi/musicsearcher/parsers/HTMLParser.java
 create mode 100644 src/main/java/moe/nekojimi/musicsearcher/parsers/JSONParser.java
 create mode 100644 src/main/java/moe/nekojimi/musicsearcher/parsers/Parser.java

diff --git a/src/main/java/moe/nekojimi/musicsearcher/parsers/HTMLParser.java b/src/main/java/moe/nekojimi/musicsearcher/parsers/HTMLParser.java
new file mode 100644
index 0000000..446a420
--- /dev/null
+++ b/src/main/java/moe/nekojimi/musicsearcher/parsers/HTMLParser.java
@@ -0,0 +1,70 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package moe.nekojimi.musicsearcher.parsers;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+
+/** {@link Parser} for HTML result pages; selectors are jsoup CSS queries. */
+public class HTMLParser extends Parser<Document, Element> 
+{
+    /** Parses the stream as HTML; yields an empty document when reading fails. */
+    @Override
+    public Document getDocument(InputStream input) 
+    {
+        try 
+        {
+            // null charset: jsoup detects the encoding (BOM/meta tag, else UTF-8)
+            return Jsoup.parse(input, null, "");
+        } 
+        catch (IOException ex) 
+        {
+            Logger.getLogger(HTMLParser.class.getName()).log(Level.SEVERE, null, ex);
+            return Document.createShell("");
+        }
+    }
+
+    /** Returns every element of document matching selector. */
+    @Override
+    public Collection<Element> getResults(Document document, String selector) 
+    {
+        return document.select(selector);
+    }
+
+    /** Returns the text of the first match of selector, or null when nothing matches. */
+    @Override
+    public String getField(Element object, String selector) 
+    {
+        Element ele = object.selectFirst(selector);
+        return (ele != null) ? ele.text() : null;
+    }
+
+    /**
+     * Resolves the link held by the first match of selector: its href attribute
+     * when present, otherwise its text. Relative links are resolved against baseURL.
+     * Returns null when selector matches nothing.
+     */
+    @Override
+    public URL getURLField(Element object, URL baseURL, String selector) throws MalformedURLException 
+    {
+        Element ele = object.selectFirst(selector);
+        if (ele == null)
+            return null;
+        String link = ele.hasAttr("href") ? ele.attr("href") : ele.text();
+        if (link.startsWith("http"))
+            return new URL(link);
+        return new URL(baseURL, link);
+    }
+}
diff --git a/src/main/java/moe/nekojimi/musicsearcher/parsers/JSONParser.java b/src/main/java/moe/nekojimi/musicsearcher/parsers/JSONParser.java
new file mode 100644
index 0000000..cbca262
--- /dev/null
+++ b/src/main/java/moe/nekojimi/musicsearcher/parsers/JSONParser.java
@@ -0,0 +1,72 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package moe.nekojimi.musicsearcher.parsers;
+
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.List;
+import javax.json.Json;
+import javax.json.JsonObject;
+import javax.json.JsonReader;
+import javax.json.JsonStructure;
+import javax.json.JsonValue;
+
+/**
+ * {@link Parser} for JSON responses; selectors are JSON Pointers (RFC 6901).
+ * @author jim
+ */
+public class JSONParser extends Parser<JsonStructure, JsonObject>
+{
+    /** Reads the whole stream as one JSON structure, then closes the reader and stream. */
+    @Override
+    public JsonStructure getDocument(InputStream input) 
+    {
+        try (JsonReader reader = Json.createReader(input))
+        {
+            return reader.read();
+        }
+    }
+
+    /**
+     * Returns the result objects found at selector: the elements of an array,
+     * the object itself for a single object, or nothing for any other value.
+     */
+    @Override
+    public Collection<JsonObject> getResults(JsonStructure document, String selector) 
+    {
+        JsonValue value = document.getValue(selector);
+        if (value.getValueType() == JsonValue.ValueType.ARRAY)
+            return value.asJsonArray().getValuesAs(JsonObject.class);
+        if (value.getValueType() == JsonValue.ValueType.OBJECT)
+            return List.of(value.asJsonObject());
+        return List.of();
+    }
+
+    /**
+     * Returns the value at selector as plain text. String values are unwrapped with
+     * getString(), because JsonString.toString() keeps the surrounding quotes.
+     */
+    @Override
+    public String getField(JsonObject object, String selector) 
+    {
+        JsonValue value = object.getValue(selector);
+        if (value.getValueType() == JsonValue.ValueType.STRING)
+            return ((javax.json.JsonString) value).getString();
+        return value.toString();
+    }
+
+    /** Returns the link at selector, resolved against baseURL unless it starts with "http". */
+    @Override
+    public URL getURLField(JsonObject object, URL baseURL, String selector) throws MalformedURLException 
+    {
+        String link = getField(object, selector);
+        if (link.startsWith("http"))
+            return new URL(link);
+        return new URL(baseURL, link);
+    }
+}
diff --git a/src/main/java/moe/nekojimi/musicsearcher/parsers/Parser.java b/src/main/java/moe/nekojimi/musicsearcher/parsers/Parser.java
new file mode 100644
index 0000000..52ff0bd
--- /dev/null
+++ b/src/main/java/moe/nekojimi/musicsearcher/parsers/Parser.java
@@ -0,0 +1,30 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package moe.nekojimi.musicsearcher.parsers;
+
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collection;
+
+/**
+ * Turns a fetched document (D) into result objects (O) and reads their fields.
+ * @author jim
+ */
+public abstract class Parser<D,O>
+{
+    public abstract D getDocument(InputStream input);
+    
+    public abstract Collection<O> getResults(D document, String selector);
+    public Collection<O> getResults(InputStream input, String selector)
+    {
+        D document = getDocument(input); // a null document means parsing failed: no results
+        return (document == null) ? java.util.List.<O>of() : getResults(document, selector);
+    }
+    
+    public abstract String getField(O object, String selector);
+    public abstract URL getURLField(O object, URL baseURL, String selector) throws MalformedURLException;
+}
diff --git a/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java b/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java
index 179f42f..3763ff5 100644
--- a/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java
+++ b/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java
@@ -12,6 +12,8 @@ import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ForkJoinPool;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
+import moe.nekojimi.musicsearcher.Query;
+import moe.nekojimi.musicsearcher.QueryFieldUnsupportedException;
 import moe.nekojimi.musicsearcher.Result;
 
 /**
@@ -40,7 +42,7 @@ public abstract class Searcher
         return name;
     }
     
-    public List<Result> searchAndWait(String query) throws InterruptedException, ExecutionException
+    public List<Result> searchAndWait(Query query) throws InterruptedException, ExecutionException
     {
         try 
         {
@@ -51,19 +53,19 @@ public abstract class Searcher
         }
     }
     
-    public List<Result> searchAndWait(String query, long limit, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException
+    public List<Result> searchAndWait(Query query, long limit, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException
     {
         return search(query).get(limit, unit);
     }
     
-    public CompletableFuture<List<Result>> search(String query)
+    public CompletableFuture<List<Result>> search(Query query)
     {
         CompletableFuture<List<Result>> future = new CompletableFuture<>();
         future.completeAsync(()->doSearch(query), executor);
         return future;
     }
 
-    protected abstract List<Result> doSearch(String query);
+    protected abstract List<Result> doSearch(Query query) throws QueryFieldUnsupportedException;
 
     @Override
     public String toString() {
diff --git a/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java b/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java
index 79b736b..748d21e 100644
--- a/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java
+++ b/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java
@@ -6,20 +6,26 @@
 package moe.nekojimi.musicsearcher.providers;
 
 import com.amihaiemil.eoyaml.YamlMapping;
+import com.amihaiemil.eoyaml.YamlNode;
 import java.io.IOException;
+import java.io.InputStream;
 import java.net.MalformedURLException;
+import java.net.URISyntaxException;
 import java.net.URL;
-import java.net.URLEncoder;
-import java.nio.charset.Charset;
+import java.util.Collection;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import java.util.stream.Collectors;
+import moe.nekojimi.musicsearcher.Query;
 import moe.nekojimi.musicsearcher.Result;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
+import moe.nekojimi.musicsearcher.SecretStore;
+import moe.nekojimi.musicsearcher.parsers.HTMLParser;
+import moe.nekojimi.musicsearcher.parsers.JSONParser;
+import moe.nekojimi.musicsearcher.parsers.Parser;
+import org.apache.http.client.utils.URIBuilder;
 
 /**
  *
@@ -27,14 +33,18 @@ import org.jsoup.select.Elements;
  */
 public class WebScraperSearcher extends Searcher
 {
-    private String searchUrl;
-    private URL rootURL;
+    protected String searchUrl;
+    protected URL rootURL;
     
-    private String resultSelector;
-    private String resultArtistSelector;
-    private String resultTitleSelector;
-    private String resultLinkSelector;
-    private String resultAlbumArtistSelector;
+    protected String resultSelector;
+    protected String resultArtistSelector;
+    protected String resultTitleSelector;
+    protected String resultLinkSelector;
+    protected String resultAlbumArtistSelector;
+    
+    protected Map<String,String> searchFields = new HashMap<>();
+    
+    protected Parser<?,?> parser;
 
     public WebScraperSearcher(String name)
     {
@@ -45,11 +55,12 @@ public class WebScraperSearcher extends Searcher
     {
         super(yaml);
         searchUrl = yaml.string("search_url");
-        rootURL = fillURL("");
-        rootURL = new URL(rootURL.getProtocol(), rootURL.getHost(), "");
+        rootURL = new URL(searchUrl);
+//        rootURL = fillURL(Query.fullText(""));
+//        rootURL = new URL(rootURL.getProtocol(), rootURL.getHost(), "");
         resultSelector = yaml.string("result_selector");
         
-        YamlMapping fields = yaml.yamlMapping("result_field_selectors");
+        YamlMapping fields = yaml.yamlMapping("result_fields");
         if (fields != null)
         {
             resultArtistSelector = fields.string("artist");
@@ -57,84 +68,93 @@ public class WebScraperSearcher extends Searcher
             resultLinkSelector = fields.string("link");
             resultAlbumArtistSelector = fields.string("album_artist");
         }
+        
+        YamlMapping searchFieldMap = yaml.yamlMapping("search_fields");
+        for (YamlNode key: searchFieldMap.keys())
+            searchFields.put(key.asScalar().value(), searchFieldMap.string(key));
+        
+        String formatName = yaml.string("format");
+        switch(formatName)
+        {
+            case "html":
+                parser = new HTMLParser(); break;
+            case "json":
+                parser = new JSONParser(); break;
+            default:
+                throw new IllegalArgumentException("Format " + formatName + " is unknown.");
+        }
     }
 
     @Override
-    protected List<Result> doSearch(String query) 
+    protected List<Result> doSearch(Query query) 
     {
         try 
         {
             URL url = fillURL(query);
-            Document doc = Jsoup.parse(url, 10000);
-            System.out.println("Document from " + name + ":" + doc.html());
-            Elements resultEles = doc.select(resultSelector);
-            return resultEles.stream()
-                    .map((ele)->parseResultElement(ele))
-                    .filter((res)->res!=null)
-                    .collect(Collectors.toList());
+            try (InputStream input = url.openStream()) // always release the connection
+            { return processResults(parser, input); }
         } catch (IOException ex) 
         {
             Logger.getLogger(WebScraperSearcher.class.getName()).log(Level.SEVERE, null, ex);
             return List.of();
+        } 
+    }
+
+    /**
+     * Builds the request URL: rootURL plus the configured "query" and "secret" parameters.
+     */
+    protected URL fillURL(Query query) throws MalformedURLException 
+    {
+        try
+        {
+            URIBuilder builder = new URIBuilder(rootURL.toURI());
+            if (query.getTextSearch() != null && searchFields.containsKey("query"))
+                builder.addParameter(searchFields.get("query"), transformSearchString(query.getTextSearch()));
+            if (searchFields.containsKey("secret"))
+            {
+                builder.addParameter(searchFields.get("secret"), SecretStore.get().getSecret(name));
+            }
+            return builder.build().toURL();
+        }
+        catch (URISyntaxException ex)
+        {
+            // MalformedURLException has no cause constructor, so attach the cause by hand.
+            throw (MalformedURLException) new MalformedURLException(ex.getMessage()).initCause(ex);
         }
     }
-
-    protected URL fillURL(String query) throws MalformedURLException 
+    
+    protected String transformSearchString(String search)
     {
-        URL url = new URL(searchUrl.replaceAll("\\$QUERY", URLEncoder.encode(query, Charset.forName("utf-8"))));
-        return url;
+        return search; // identity here; fillURL() passes the query text through this hook
+    }
+    
+    /** Parses input with parser and maps each result element to a Result, dropping nulls. */
+    protected <E> List<Result> processResults(Parser<?,E> parser, InputStream input)
+    {
+        return parser.getResults(input, resultSelector).stream()
+            .map(element -> parseResultElement(parser, element))
+            .filter(result -> result != null)
+            .collect(Collectors.toList());
     }
 
-    protected Result parseResultElement(Element ele)
+    protected <E> Result parseResultElement(Parser<?,E> parser, E ele)
     {
         try
         {
             Result res = new Result();
             // Artist
             if (resultArtistSelector != null)
-            {
-                Element artistEle = ele.selectFirst(resultArtistSelector);
-                if (artistEle != null)
-                    res.setArtist(artistEle.text());
-            }
+                res.setArtist(parser.getField(ele, resultArtistSelector));
             // Title
             if (resultTitleSelector != null)
-            {
-                Element titleEle = ele.selectFirst(resultTitleSelector);
-                if (titleEle != null)
-                    res.setTitle(titleEle.text());
-            }
+                res.setTitle(parser.getField(ele, resultTitleSelector));
             // Link
             if (resultLinkSelector != null)
-            {
-                Element linkEle = ele.selectFirst(resultLinkSelector);
-                if (linkEle != null)
-                {
-                    String link;
-                    if (linkEle.hasAttr("href"))
-                        link = linkEle.attr("href");
-                    else
-                        link = linkEle.text();
-                    
-                    URL url;
-                    if(!link.startsWith("http"))
-                        url = new URL(rootURL, link);
-                    else
-                        url = new URL(link);
-                    
-                    res.setLink(url);
-                }
-            }
+                res.setLink(parser.getURLField(ele, rootURL, resultLinkSelector));
 
             // Artist + Album
             if (resultAlbumArtistSelector != null)
-            {
-                Element alArtEle = ele.selectFirst(resultAlbumArtistSelector);
-                if (alArtEle != null)
-                {
-                    
-                }
-            }
+                res.setAlbumArtist(parser.getField(ele, resultAlbumArtistSelector));
             
             // Artist + Title
 
@@ -146,5 +166,4 @@ public class WebScraperSearcher extends Searcher
             return null;
         }
     }
-    
 }