You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

151 lines
4.5 KiB

/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package moe.nekojimi.musicsearcher.providers;
import com.amihaiemil.eoyaml.YamlMapping;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import moe.nekojimi.musicsearcher.Result;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * A {@link Searcher} that fetches an HTML search-results page and extracts
 * {@link Result}s from it using CSS selectors (via jsoup).
 * <p>
 * When built from YAML, the following keys are read:
 * <ul>
 * <li>{@code search_url} - URL template; every occurrence of {@code $QUERY}
 * is replaced with the URL-encoded (UTF-8) query;</li>
 * <li>{@code result_selector} - selector matching one element per result;</li>
 * <li>{@code result_field_selectors} - optional mapping with the keys
 * {@code artist}, {@code title}, {@code link} and {@code album_artist},
 * each a selector evaluated relative to a result element.</li>
 * </ul>
 *
 * @author jim
 */
public class WebScraperSearcher extends Searcher
{
    private static final Logger LOGGER = Logger.getLogger(WebScraperSearcher.class.getName());

    /** Charset used to URL-encode the query before it is placed in the search URL. */
    private static final Charset QUERY_CHARSET = Charset.forName("utf-8");

    /** Literal placeholder in {@code search_url} that is replaced by the encoded query. */
    private static final String QUERY_PLACEHOLDER = "$QUERY";

    /** Timeout, in milliseconds, passed to jsoup when fetching the results page. */
    private static final int FETCH_TIMEOUT_MILLIS = 10000;

    private String searchUrl;
    private URL rootURL;
    private String resultSelector;
    private String resultArtistSelector;
    private String resultTitleSelector;
    private String resultLinkSelector;
    private String resultAlbumArtistSelector;

    /**
     * Creates a searcher with only a name. No search URL or selectors are
     * set by this constructor, so {@link #doSearch(String)} returns an empty
     * list for such an instance.
     *
     * @param name passed through to the {@link Searcher} constructor
     */
    public WebScraperSearcher(String name)
    {
        super(name);
    }

    /**
     * Creates a searcher from its YAML configuration.
     *
     * @param yaml configuration mapping; see the class description for the keys read
     * @throws MalformedURLException if {@code search_url} is absent or does not
     *                               form a valid URL once the placeholder is filled
     */
    public WebScraperSearcher(YamlMapping yaml) throws MalformedURLException
    {
        super(yaml);
        searchUrl = yaml.string("search_url");
        if (searchUrl == null)
        {
            // Previously this surfaced as a bare NullPointerException from fillURL().
            throw new MalformedURLException("Missing \"search_url\" in web scraper configuration");
        }

        // Derive the site root (protocol + host + port) from the search URL so
        // that relative result links can be resolved against it. The port must
        // be carried over: getPort() returns -1 when none is given, which the
        // URL constructor interprets as "use the protocol's default port".
        URL filled = fillURL("");
        rootURL = new URL(filled.getProtocol(), filled.getHost(), filled.getPort(), "");

        resultSelector = yaml.string("result_selector");
        YamlMapping fields = yaml.yamlMapping("result_field_selectors");
        if (fields != null)
        {
            resultArtistSelector = fields.string("artist");
            resultTitleSelector = fields.string("title");
            resultLinkSelector = fields.string("link");
            resultAlbumArtistSelector = fields.string("album_artist");
        }
    }

    /**
     * Fetches the search page for {@code query} and converts every element
     * matching the result selector into a {@link Result}. Elements that fail
     * to parse are skipped.
     *
     * @param query the search text; it is URL-encoded before use
     * @return the parsed results, or an empty list if the searcher is not
     *         configured or the page could not be fetched
     */
    @Override
    protected List<Result> doSearch(String query)
    {
        if (searchUrl == null || resultSelector == null)
        {
            LOGGER.log(Level.WARNING, "Searcher {0} has no search URL or result selector configured", name);
            return List.of();
        }
        try
        {
            URL url = fillURL(query);
            Document doc = Jsoup.parse(url, FETCH_TIMEOUT_MILLIS);
            // Full-page dump is debug output only; the guard avoids serialising
            // the document when FINE logging is disabled.
            if (LOGGER.isLoggable(Level.FINE))
            {
                LOGGER.log(Level.FINE, "Document from {0}:{1}", new Object[]{name, doc.html()});
            }
            Elements resultEles = doc.select(resultSelector);
            return resultEles.stream()
                    .map(this::parseResultElement)
                    .filter(res -> res != null)
                    .collect(Collectors.toList());
        }
        catch (IOException ex)
        {
            LOGGER.log(Level.SEVERE, "Failed to fetch search results from " + name, ex);
            return List.of();
        }
    }

    /**
     * Builds the concrete search URL by substituting the URL-encoded query
     * for every {@code $QUERY} placeholder in the configured template.
     *
     * @param query the raw query text
     * @return the URL to fetch
     * @throws MalformedURLException if the filled-in template is not a valid URL
     */
    protected URL fillURL(String query) throws MalformedURLException
    {
        String encodedQuery = URLEncoder.encode(query, QUERY_CHARSET);
        // Literal replacement: no regex escaping or replacement-string quoting needed.
        return new URL(searchUrl.replace(QUERY_PLACEHOLDER, encodedQuery));
    }

    /**
     * Extracts a {@link Result} from a single result element using the
     * configured field selectors. Fields whose selector is not configured or
     * matches nothing are left unset.
     *
     * @param ele the element matched by the result selector
     * @return the populated result, or {@code null} if extraction failed
     *         (for example because the link is not a valid URL)
     */
    protected Result parseResultElement(Element ele)
    {
        try
        {
            Result res = new Result();

            String artist = selectText(ele, resultArtistSelector);
            if (artist != null)
            {
                res.setArtist(artist);
            }

            String title = selectText(ele, resultTitleSelector);
            if (title != null)
            {
                res.setTitle(title);
            }

            if (resultLinkSelector != null)
            {
                Element linkEle = ele.selectFirst(resultLinkSelector);
                if (linkEle != null)
                {
                    // Prefer the href attribute; fall back to the element text
                    // for sites that print the URL as plain text.
                    String link = linkEle.hasAttr("href") ? linkEle.attr("href") : linkEle.text();
                    // URL(context, spec) ignores the context for absolute specs
                    // and resolves relative ones against the site root, so no
                    // "starts with http" guess is needed (that guess misfired on
                    // relative paths such as "https-mirrors/x").
                    res.setLink(new URL(rootURL, link));
                }
            }

            // TODO: the "album_artist" selector is read from the configuration
            // but nothing is extracted from it yet. The former lookup discarded
            // its result, so it has been dropped until the field is implemented.

            return res;
        }
        catch (Exception ex)
        {
            LOGGER.log(Level.SEVERE, "Failed to parse result element", ex);
            return null;
        }
    }

    /**
     * Returns the text of the first descendant of {@code ele} matching
     * {@code selector}, or {@code null} if the selector is not configured or
     * matches nothing.
     */
    private static String selectText(Element ele, String selector)
    {
        if (selector == null)
        {
            return null;
        }
        Element match = ele.selectFirst(selector);
        return match == null ? null : match.text();
    }
}