Made WebScraperSearcher able to parse different result formats (HTML/JSON) and supply query parameters dynamically.

master
Nekojimi 3 years ago
parent 039a91ed40
commit b8336c26fb
  1. 70
      src/main/java/moe/nekojimi/musicsearcher/parsers/HTMLParser.java
  2. 72
      src/main/java/moe/nekojimi/musicsearcher/parsers/JSONParser.java
  3. 30
      src/main/java/moe/nekojimi/musicsearcher/parsers/Parser.java
  4. 10
      src/main/java/moe/nekojimi/musicsearcher/providers/Searcher.java
  5. 151
      src/main/java/moe/nekojimi/musicsearcher/providers/WebScraperSearcher.java

@ -0,0 +1,70 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package moe.nekojimi.musicsearcher.parsers;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * {@link Parser} for HTML responses, backed by jsoup.
 * <p>
 * Selectors passed to this class are jsoup CSS queries. A selector that matches
 * nothing inside a result element yields {@code null} rather than an exception,
 * so a result that lacks an optional field can still be processed.
 */
public class HTMLParser extends Parser<Document, Element>
{
    /**
     * Parses the given stream as an HTML document.
     *
     * @param input the raw response body; it is closed once it has been read
     * @return the parsed document, or {@code null} if reading the stream failed
     *         (the failure is logged at SEVERE level)
     */
    @Override
    public Document getDocument(InputStream input)
    {
        try (InputStream in = input)
        {
            // Passing a null charset lets jsoup work out the encoding itself instead of
            // decoding the bytes with whatever the platform default charset happens to be.
            return Jsoup.parse(in, null, "");
        }
        catch (IOException ex)
        {
            Logger.getLogger(HTMLParser.class.getName()).log(Level.SEVERE, null, ex);
            return null;
        }
    }

    /**
     * Selects every element of the document that matches the selector.
     *
     * @param document the parsed HTML document
     * @param selector CSS query identifying one result each
     * @return the matching elements
     */
    @Override
    public Collection<Element> getResults(Document document, String selector)
    {
        return document.select(selector);
    }

    /**
     * Returns the text of the first element under {@code object} matching the selector.
     *
     * @param object   the result element to search in
     * @param selector CSS query for the field
     * @return the element's text, or {@code null} if nothing matches
     */
    @Override
    public String getField(Element object, String selector)
    {
        Element ele = object.selectFirst(selector);
        if (ele == null)
        {
            return null;
        }
        return ele.text();
    }

    /**
     * Extracts a link from the first element under {@code object} matching the selector.
     * The element's {@code href} attribute is used when present, otherwise its text.
     * Links that do not start with "http" are resolved against {@code baseURL}.
     *
     * @param object   the result element to search in
     * @param baseURL  context used to resolve relative links
     * @param selector CSS query for the link element
     * @return the resolved URL, or {@code null} if nothing matches
     * @throws MalformedURLException if the link text cannot be turned into a URL
     */
    @Override
    public URL getURLField(Element object, URL baseURL, String selector) throws MalformedURLException
    {
        Element ele = object.selectFirst(selector);
        if (ele == null)
        {
            return null;
        }
        String link = ele.hasAttr("href") ? ele.attr("href") : ele.text();
        if (link.startsWith("http"))
        {
            return new URL(link);
        }
        return new URL(baseURL, link);
    }
}

@ -0,0 +1,72 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package moe.nekojimi.musicsearcher.parsers;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.List;
import javax.json.Json;
import javax.json.JsonObject;
import javax.json.JsonReader;
import javax.json.JsonStructure;
import javax.json.JsonValue;
/**
*
* @author jim
*/
public class JSONParser extends Parser<JsonStructure, JsonObject>
{
@Override
public JsonStructure getDocument(InputStream input)
{
JsonReader reader = Json.createReader(input);
return reader.read();
}
@Override
public Collection<JsonObject> getResults(JsonStructure document, String selector)
{
JsonValue value = document.getValue(selector);
if (value.getValueType() == JsonValue.ValueType.ARRAY)
{
return value.asJsonArray().getValuesAs(JsonObject.class);
}
else if (value.getValueType() == JsonValue.ValueType.OBJECT)
{
return List.of(value.asJsonObject());
}
return List.of();
}
@Override
public String getField(JsonObject object, String selector)
{
return object.getValue(selector).toString();
}
@Override
public URL getURLField(JsonObject object, URL baseURL, String selector) throws MalformedURLException
{
String link = getField(object, selector);
URL url;
if(!link.startsWith("http"))
url = new URL(baseURL, link);
else
url = new URL(link);
return url;
}
private JsonValue navigate(JsonObject from, String selector)
{
return from.getValue(selector);
}
}

@ -0,0 +1,30 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package moe.nekojimi.musicsearcher.parsers;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
/**
 * Abstraction over a response format from which search results can be extracted.
 *
 * @param <D> type of a whole parsed document
 * @param <O> type of a single result entry within a document
 * @author jim
 */
public abstract class Parser<D,O>
{
    /**
     * Parses a raw response body into a document.
     *
     * @param input the raw response body
     * @return the parsed document; implementations may return {@code null} on failure
     */
    public abstract D getDocument(InputStream input);

    /**
     * Finds the result entries of a parsed document.
     *
     * @param document the parsed document
     * @param selector format-specific expression locating the results
     * @return the result entries
     */
    public abstract Collection<O> getResults(D document, String selector);

    /**
     * Convenience overload: parses the stream with {@link #getDocument(InputStream)}
     * and extracts the results from the resulting document.
     *
     * @param input    the raw response body
     * @param selector format-specific expression locating the results
     * @return the result entries, or an empty collection if no document could be parsed
     */
    public Collection<O> getResults(InputStream input, String selector)
    {
        D document = getDocument(input);
        if (document == null)
        {
            // A failed parse means "no results", not a NullPointerException further down.
            return java.util.Collections.emptyList();
        }
        return getResults(document, selector);
    }

    /**
     * Reads one textual field from a result entry.
     *
     * @param object   the result entry
     * @param selector format-specific expression locating the field
     * @return the field's text
     */
    public abstract String getField(O object, String selector);

    /**
     * Reads one link field from a result entry.
     *
     * @param object   the result entry
     * @param baseURL  context against which a relative link can be resolved
     * @param selector format-specific expression locating the field
     * @return the link as a URL
     * @throws MalformedURLException if the field cannot be turned into a URL
     */
    public abstract URL getURLField(O object, URL baseURL, String selector) throws MalformedURLException;
}

@ -12,6 +12,8 @@ import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import moe.nekojimi.musicsearcher.Query;
import moe.nekojimi.musicsearcher.QueryFieldUnsupportedException;
import moe.nekojimi.musicsearcher.Result;
/**
@ -40,7 +42,7 @@ public abstract class Searcher
return name;
}
public List<Result> searchAndWait(String query) throws InterruptedException, ExecutionException
public List<Result> searchAndWait(Query query) throws InterruptedException, ExecutionException
{
try
{
@ -51,19 +53,19 @@ public abstract class Searcher
}
}
public List<Result> searchAndWait(String query, long limit, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException
public List<Result> searchAndWait(Query query, long limit, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException
{
return search(query).get(limit, unit);
}
public CompletableFuture<List<Result>> search(String query)
public CompletableFuture<List<Result>> search(Query query)
{
CompletableFuture<List<Result>> future = new CompletableFuture<>();
future.completeAsync(()->doSearch(query), executor);
return future;
}
protected abstract List<Result> doSearch(String query);
protected abstract List<Result> doSearch(Query query) throws QueryFieldUnsupportedException;
@Override
public String toString() {

@ -6,20 +6,26 @@
package moe.nekojimi.musicsearcher.providers;
import com.amihaiemil.eoyaml.YamlMapping;
import com.amihaiemil.eoyaml.YamlNode;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import moe.nekojimi.musicsearcher.Query;
import moe.nekojimi.musicsearcher.Result;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import moe.nekojimi.musicsearcher.SecretStore;
import moe.nekojimi.musicsearcher.parsers.HTMLParser;
import moe.nekojimi.musicsearcher.parsers.JSONParser;
import moe.nekojimi.musicsearcher.parsers.Parser;
import org.apache.http.client.utils.URIBuilder;
/**
*
@ -27,14 +33,18 @@ import org.jsoup.select.Elements;
*/
public class WebScraperSearcher extends Searcher
{
private String searchUrl;
private URL rootURL;
protected String searchUrl;
protected URL rootURL;
private String resultSelector;
private String resultArtistSelector;
private String resultTitleSelector;
private String resultLinkSelector;
private String resultAlbumArtistSelector;
protected String resultSelector;
protected String resultArtistSelector;
protected String resultTitleSelector;
protected String resultLinkSelector;
protected String resultAlbumArtistSelector;
protected Map<String,String> searchFields = new HashMap<>();
protected Parser<?,?> parser;
public WebScraperSearcher(String name)
{
@ -45,11 +55,12 @@ public class WebScraperSearcher extends Searcher
{
super(yaml);
searchUrl = yaml.string("search_url");
rootURL = fillURL("");
rootURL = new URL(rootURL.getProtocol(), rootURL.getHost(), "");
rootURL = new URL(searchUrl);
// rootURL = fillURL(Query.fullText(""));
// rootURL = new URL(rootURL.getProtocol(), rootURL.getHost(), "");
resultSelector = yaml.string("result_selector");
YamlMapping fields = yaml.yamlMapping("result_field_selectors");
YamlMapping fields = yaml.yamlMapping("result_fields");
if (fields != null)
{
resultArtistSelector = fields.string("artist");
@ -57,84 +68,93 @@ public class WebScraperSearcher extends Searcher
resultLinkSelector = fields.string("link");
resultAlbumArtistSelector = fields.string("album_artist");
}
YamlMapping searchFieldMap = yaml.yamlMapping("search_fields");
for (YamlNode key: searchFieldMap.keys())
searchFields.put(key.asScalar().value(), searchFieldMap.string(key));
String formatName = yaml.string("format");
switch(formatName)
{
case "html":
parser = new HTMLParser(); break;
case "json":
parser = new JSONParser(); break;
default:
throw new IllegalArgumentException("Format " + formatName + " is unknown.");
}
}
@Override
protected List<Result> doSearch(String query)
protected List<Result> doSearch(Query query)
{
try
{
URL url = fillURL(query);
Document doc = Jsoup.parse(url, 10000);
System.out.println("Document from " + name + ":" + doc.html());
Elements resultEles = doc.select(resultSelector);
return resultEles.stream()
.map((ele)->parseResultElement(ele))
.filter((res)->res!=null)
.collect(Collectors.toList());
InputStream input = url.openStream();
return processResults(parser, input);
} catch (IOException ex)
{
Logger.getLogger(WebScraperSearcher.class.getName()).log(Level.SEVERE, null, ex);
return List.of();
}
}
}
protected URL fillURL(String query) throws MalformedURLException
protected URL fillURL(Query query) throws MalformedURLException
{
URL url = new URL(searchUrl.replaceAll("\\$QUERY", URLEncoder.encode(query, Charset.forName("utf-8"))));
return url;
// URL url = new URL(searchUrl.replaceAll("\\$QUERY", URLEncoder.encode(query.getTextSearch(), Charset.forName("utf-8"))));
try
{
URIBuilder builder = new URIBuilder(rootURL.toURI());
if (query.getTextSearch() != null)
{
if (searchFields.containsKey("query"))
builder.addParameter(searchFields.get("query"), transformSearchString(query.getTextSearch()));
}
if (searchFields.containsKey("secret"))
{
builder.addParameter(searchFields.get("secret"), SecretStore.get().getSecret(name));
}
return builder.build().toURL();
}
catch (URISyntaxException ex)
{
throw new MalformedURLException();
}
}
/**
 * Hook applied to the free-text search string before fillURL adds it as a
 * query parameter. This implementation returns the string unchanged; being
 * protected and non-final, it can be overridden by subclasses.
 *
 * @param search the text search taken from the query
 * @return the string to send as the query parameter value
 */
protected String transformSearchString(String search)
{
return search;
}
/**
 * Extracts the result entries from a response body and converts each one to a Result.
 * Entries for which parseResultElement yields null are left out of the returned list.
 *
 * @param <E>    the parser's result-entry type
 * @param parser the parser matching the response format
 * @param input  the raw response body
 * @return the successfully converted results, in document order
 */
protected <E> List<Result> processResults(Parser<?,E> parser, InputStream input)
{
    Collection<E> entries = parser.getResults(input, resultSelector);
    List<Result> converted = entries.stream()
            .map(entry -> parseResultElement(parser, entry))
            .filter(result -> result != null)
            .collect(Collectors.toList());
    return converted;
}
protected Result parseResultElement(Element ele)
protected <E> Result parseResultElement(Parser<?,E> parser, E ele)
{
try
{
Result res = new Result();
// Artist
if (resultArtistSelector != null)
{
Element artistEle = ele.selectFirst(resultArtistSelector);
if (artistEle != null)
res.setArtist(artistEle.text());
}
res.setArtist(parser.getField(ele, resultArtistSelector));
// Title
if (resultTitleSelector != null)
{
Element titleEle = ele.selectFirst(resultTitleSelector);
if (titleEle != null)
res.setTitle(titleEle.text());
}
res.setTitle(parser.getField(ele, resultTitleSelector));
// Link
if (resultLinkSelector != null)
{
Element linkEle = ele.selectFirst(resultLinkSelector);
if (linkEle != null)
{
String link;
if (linkEle.hasAttr("href"))
link = linkEle.attr("href");
else
link = linkEle.text();
URL url;
if(!link.startsWith("http"))
url = new URL(rootURL, link);
else
url = new URL(link);
res.setLink(url);
}
}
res.setLink(parser.getURLField(ele, rootURL, resultLinkSelector));
// Artist + Album
if (resultAlbumArtistSelector != null)
{
Element alArtEle = ele.selectFirst(resultAlbumArtistSelector);
if (alArtEle != null)
{
}
}
res.setAlbumArtist(parser.getField(ele, resultAlbumArtistSelector));
// Artist + Title
@ -146,5 +166,4 @@ public class WebScraperSearcher extends Searcher
return null;
}
}
}

Loading…
Cancel
Save