From b8336c26fbdc24b3869bb0479f8bc1c299df1618 Mon Sep 17 00:00:00 2001 From: Nekojimi Date: Thu, 30 Sep 2021 17:07:22 +0100 Subject: [PATCH] Made WebScraperSearcher able to parse different result formats (HTML/JSON) and supply query parameters dynamically. --- .../musicsearcher/parsers/HTMLParser.java | 70 ++++++++ .../musicsearcher/parsers/JSONParser.java | 72 +++++++++ .../musicsearcher/parsers/Parser.java | 30 ++++ .../musicsearcher/providers/Searcher.java | 10 +- .../providers/WebScraperSearcher.java | 151 ++++++++++-------- 5 files changed, 263 insertions(+), 70 deletions(-) create mode 100644 src/main/java/moe/nekojimi/musicsearcher/parsers/HTMLParser.java create mode 100644 src/main/java/moe/nekojimi/musicsearcher/parsers/JSONParser.java create mode 100644 src/main/java/moe/nekojimi/musicsearcher/parsers/Parser.java diff --git a/src/main/java/moe/nekojimi/musicsearcher/parsers/HTMLParser.java b/src/main/java/moe/nekojimi/musicsearcher/parsers/HTMLParser.java new file mode 100644 index 0000000..446a420 --- /dev/null +++ b/src/main/java/moe/nekojimi/musicsearcher/parsers/HTMLParser.java @@ -0,0 +1,70 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package moe.nekojimi.musicsearcher.parsers; + +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Collection; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + + +public class HTMLParser extends Parser +{ + @Override + public Document getDocument(InputStream input) + { + try + { + Document doc = Jsoup.parse(new String(input.readAllBytes())); +// System.out.println(doc.outerHtml()); + return doc; + } + catch (IOException ex) + { + Logger.getLogger(HTMLParser.class.getName()).log(Level.SEVERE, null, ex); + return null; + } + } + + @Override + public Collection getResults(Document document, String selector) + { + return document.select(selector); + } + + @Override + public String getField(Element object, String selector) + { + Element ele = object.selectFirst(selector); + return ele.text(); + } + + @Override + public URL getURLField(Element object, URL baseURL, String selector) throws MalformedURLException + { + Element ele = object.selectFirst(selector); + + String link; + if (ele.hasAttr("href")) + link = ele.attr("href"); + else + link = ele.text(); + + URL url; + if(!link.startsWith("http")) + url = new URL(baseURL, link); + else + url = new URL(link); + + return url; + } +} diff --git a/src/main/java/moe/nekojimi/musicsearcher/parsers/JSONParser.java b/src/main/java/moe/nekojimi/musicsearcher/parsers/JSONParser.java new file mode 100644 index 0000000..cbca262 --- /dev/null +++ b/src/main/java/moe/nekojimi/musicsearcher/parsers/JSONParser.java @@ -0,0 +1,72 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package moe.nekojimi.musicsearcher.parsers; + +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Collection; +import java.util.List; +import javax.json.Json; +import javax.json.JsonObject; +import javax.json.JsonReader; +import javax.json.JsonStructure; +import javax.json.JsonValue; + +/** + * + * @author jim + */ +public class JSONParser extends Parser +{ + + @Override + public JsonStructure getDocument(InputStream input) + { + JsonReader reader = Json.createReader(input); + return reader.read(); + } + + @Override + public Collection getResults(JsonStructure document, String selector) + { + JsonValue value = document.getValue(selector); + if (value.getValueType() == JsonValue.ValueType.ARRAY) + { + return value.asJsonArray().getValuesAs(JsonObject.class); + } + else if (value.getValueType() == JsonValue.ValueType.OBJECT) + { + return List.of(value.asJsonObject()); + } + return List.of(); + } + + @Override + public String getField(JsonObject object, String selector) + { + return object.getValue(selector).toString(); + } + + @Override + public URL getURLField(JsonObject object, URL baseURL, String selector) throws MalformedURLException + { + String link = getField(object, selector); + + URL url; + if(!link.startsWith("http")) + url = new URL(baseURL, link); + else + url = new URL(link); + + return url; + } + + private JsonValue navigate(JsonObject from, String selector) + { + return from.getValue(selector); + } +} diff --git a/src/main/java/moe/nekojimi/musicsearcher/parsers/Parser.java b/src/main/java/moe/nekojimi/musicsearcher/parsers/Parser.java new file mode 100644 index 0000000..52ff0bd --- /dev/null +++ b/src/main/java/moe/nekojimi/musicsearcher/parsers/Parser.java @@ -0,0 +1,30 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package moe.nekojimi.musicsearcher.parsers; + +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Collection; + +/** + * + * @author jim + */ +public abstract class Parser +{ + public abstract D getDocument(InputStream input); + + public abstract Collection getResults(D document, String selector); + public Collection getResults(InputStream input, String selector) + { + return getResults(getDocument(input), selector); + } + + public abstract String getField(O object, String selector); + public abstract URL getURLField(O object, URL baseURL, String selector) throws MalformedURLException; + +} diff --git a/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java b/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java index 179f42f..3763ff5 100644 --- a/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java +++ b/src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java @@ -12,6 +12,8 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import moe.nekojimi.musicsearcher.Query; +import moe.nekojimi.musicsearcher.QueryFieldUnsupportedException; import moe.nekojimi.musicsearcher.Result; /** @@ -40,7 +42,7 @@ public abstract class Searcher return name; } - public List searchAndWait(String query) throws InterruptedException, ExecutionException + public List searchAndWait(Query query) throws InterruptedException, ExecutionException { try { @@ -51,19 +53,19 @@ public abstract class Searcher } } - public List searchAndWait(String query, long limit, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException + public List searchAndWait(Query query, long limit, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException { return search(query).get(limit, unit); } - public CompletableFuture> search(String query) + public CompletableFuture> search(Query query) { CompletableFuture> future = new CompletableFuture<>(); future.completeAsync(()->doSearch(query), executor); return future; } - protected abstract List doSearch(String query); + protected abstract List doSearch(Query query) throws QueryFieldUnsupportedException; @Override public String toString() { diff --git a/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java b/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java index 79b736b..748d21e 100644 --- a/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java +++ b/src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java @@ -6,20 +6,26 @@ package moe.nekojimi.musicsearcher.providers; import com.amihaiemil.eoyaml.YamlMapping; +import com.amihaiemil.eoyaml.YamlNode; import java.io.IOException; +import java.io.InputStream; import java.net.MalformedURLException; +import java.net.URISyntaxException; import java.net.URL; -import java.net.URLEncoder; -import java.nio.charset.Charset; +import java.util.Collection; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; +import moe.nekojimi.musicsearcher.Query; import moe.nekojimi.musicsearcher.Result; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; +import moe.nekojimi.musicsearcher.SecretStore; +import moe.nekojimi.musicsearcher.parsers.HTMLParser; +import moe.nekojimi.musicsearcher.parsers.JSONParser; +import moe.nekojimi.musicsearcher.parsers.Parser; +import org.apache.http.client.utils.URIBuilder; /** * @@ -27,14 +33,18 @@ import org.jsoup.select.Elements; */ public class WebScraperSearcher extends Searcher { - private String searchUrl; - private URL rootURL; + protected String searchUrl; + protected URL rootURL; - private String resultSelector; - private String resultArtistSelector; - private String resultTitleSelector; - private String resultLinkSelector; - private String resultAlbumArtistSelector; + protected String resultSelector; + protected String resultArtistSelector; + protected String resultTitleSelector; + protected String resultLinkSelector; + protected String resultAlbumArtistSelector; + + protected Map searchFields = new HashMap<>(); + + protected Parser parser; public WebScraperSearcher(String name) { @@ -45,11 +55,12 @@ public class WebScraperSearcher extends Searcher { super(yaml); searchUrl = yaml.string("search_url"); - rootURL = fillURL(""); - rootURL = new URL(rootURL.getProtocol(), rootURL.getHost(), ""); + rootURL = new URL(searchUrl); +// rootURL = fillURL(Query.fullText("")); +// rootURL = new URL(rootURL.getProtocol(), rootURL.getHost(), ""); resultSelector = yaml.string("result_selector"); - YamlMapping fields = yaml.yamlMapping("result_field_selectors"); + YamlMapping fields = yaml.yamlMapping("result_fields"); if (fields != null) { resultArtistSelector = fields.string("artist"); @@ -57,84 +68,93 @@ public class WebScraperSearcher extends Searcher resultLinkSelector = fields.string("link"); resultAlbumArtistSelector = fields.string("album_artist"); } + + YamlMapping searchFieldMap = yaml.yamlMapping("search_fields"); + for (YamlNode key: searchFieldMap.keys()) + searchFields.put(key.asScalar().value(), searchFieldMap.string(key)); + + String formatName = yaml.string("format"); + switch(formatName) + { + case "html": + parser = new HTMLParser(); break; + case "json": + parser = new JSONParser(); break; + default: + throw new IllegalArgumentException("Format " + formatName + " is unknown."); + } } @Override - protected List doSearch(String query) + protected List doSearch(Query query) { try { URL url = fillURL(query); - Document doc = Jsoup.parse(url, 10000); - System.out.println("Document from " + name + ":" + doc.html()); - Elements resultEles = doc.select(resultSelector); - return resultEles.stream() - .map((ele)->parseResultElement(ele)) - .filter((res)->res!=null) - .collect(Collectors.toList()); + InputStream input = url.openStream(); + return processResults(parser, input); } catch (IOException ex) { Logger.getLogger(WebScraperSearcher.class.getName()).log(Level.SEVERE, null, ex); return List.of(); - } + } } - protected URL fillURL(String query) throws MalformedURLException + protected URL fillURL(Query query) throws MalformedURLException { - URL url = new URL(searchUrl.replaceAll("\\$QUERY", URLEncoder.encode(query, Charset.forName("utf-8")))); - return url; +// URL url = new URL(searchUrl.replaceAll("\\$QUERY", URLEncoder.encode(query.getTextSearch(), Charset.forName("utf-8")))); + try + { + URIBuilder builder = new URIBuilder(rootURL.toURI()); + if (query.getTextSearch() != null) + { + if (searchFields.containsKey("query")) + builder.addParameter(searchFields.get("query"), transformSearchString(query.getTextSearch())); + } + if (searchFields.containsKey("secret")) + { + builder.addParameter(searchFields.get("secret"), SecretStore.get().getSecret(name)); + } + return builder.build().toURL(); + } + catch (URISyntaxException ex) + { + throw new MalformedURLException(); + } + } + + protected String transformSearchString(String search) + { + return search; + } + + protected List processResults(Parser parser, InputStream input) + { + Collection resultEles = parser.getResults(input, resultSelector); + return resultEles.stream() + .map((ele)->parseResultElement(parser, ele)) + .filter((res)->res!=null) + .collect(Collectors.toList()); } - protected Result parseResultElement(Element ele) + protected Result parseResultElement(Parser parser, E ele) { try { Result res = new Result(); // Artist if (resultArtistSelector != null) - { - Element artistEle = ele.selectFirst(resultArtistSelector); - if (artistEle != null) - res.setArtist(artistEle.text()); - } + res.setArtist(parser.getField(ele, resultArtistSelector)); // Title if (resultTitleSelector != null) - { - Element titleEle = ele.selectFirst(resultTitleSelector); - if (titleEle != null) - res.setTitle(titleEle.text()); - } + res.setTitle(parser.getField(ele, resultTitleSelector)); // Link if (resultLinkSelector != null) - { - Element linkEle = ele.selectFirst(resultLinkSelector); - if (linkEle != null) - { - String link; - if (linkEle.hasAttr("href")) - link = linkEle.attr("href"); - else - link = linkEle.text(); - - URL url; - if(!link.startsWith("http")) - url = new URL(rootURL, link); - else - url = new URL(link); - - res.setLink(url); - } - } + res.setLink(parser.getURLField(ele, rootURL, resultLinkSelector)); // Artist + Album if (resultAlbumArtistSelector != null) - { - Element alArtEle = ele.selectFirst(resultAlbumArtistSelector); - if (alArtEle != null) - { - - } - } + res.setAlbumArtist(parser.getField(ele, resultAlbumArtistSelector)); // Artist + Title @@ -146,5 +166,4 @@ public class WebScraperSearcher extends Searcher return null; } } - }