/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package moe.nekojimi.musicsearcher.providers;

import com.amihaiemil.eoyaml.YamlMapping;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Objects;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import moe.nekojimi.musicsearcher.Result;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Searcher that downloads a search-results page and extracts {@link Result}s
 * from it with jsoup selectors.
 *
 * <p>The search URL template and the selectors are read from a YAML mapping;
 * see {@link #WebScraperSearcher(YamlMapping)} for the keys that are consulted.
 *
 * @author jim
 */
public class WebScraperSearcher extends Searcher {

    private static final Logger LOG = Logger.getLogger(WebScraperSearcher.class.getName());

    /** Token in the search URL template that is replaced by the encoded query. */
    private static final String QUERY_PLACEHOLDER = "$QUERY";

    /** Charset used when URL-encoding the query. */
    private static final Charset QUERY_CHARSET = StandardCharsets.UTF_8;

    /** Timeout, in milliseconds, passed to jsoup when fetching the results page. */
    private static final int FETCH_TIMEOUT_MILLIS = 10000;

    private String searchUrl;
    private URL rootURL;
    private String resultSelector;
    private String resultArtistSelector;
    private String resultTitleSelector;
    private String resultLinkSelector;
    // Read from configuration but not applied to results yet (see parseResultElement).
    private String resultAlbumArtistSelector;

    /**
     * Creates a searcher with only a name. No URL template or selectors are
     * set, so {@link #doSearch(String)} yields no results for such an instance.
     *
     * @param name value forwarded to the superclass constructor
     */
    public WebScraperSearcher(String name) {
        super(name);
    }

    /**
     * Creates a searcher from a YAML mapping.
     *
     * <p>Keys read here: {@code search_url} (template containing
     * {@code $QUERY}), {@code result_selector}, and the optional mapping
     * {@code result_field_selectors} with the entries {@code artist},
     * {@code title}, {@code link} and {@code album_artist}.
     *
     * @param yaml configuration mapping, also forwarded to the superclass
     * @throws MalformedURLException if {@code search_url} is null or does not
     *         form a valid URL once the placeholder is filled in
     */
    public WebScraperSearcher(YamlMapping yaml) throws MalformedURLException {
        super(yaml);
        searchUrl = yaml.string("search_url");

        // Derive the site root used to resolve relative result links. The port
        // is carried over explicitly; getPort() returns -1 when none is given,
        // which the URL constructor treats as "protocol default".
        URL probe = fillURL("");
        rootURL = new URL(probe.getProtocol(), probe.getHost(), probe.getPort(), "");

        resultSelector = yaml.string("result_selector");
        YamlMapping fields = yaml.yamlMapping("result_field_selectors");
        if (fields != null) {
            resultArtistSelector = fields.string("artist");
            resultTitleSelector = fields.string("title");
            resultLinkSelector = fields.string("link");
            resultAlbumArtistSelector = fields.string("album_artist");
        }
    }

    /**
     * Fetches the results page for {@code query} and converts every element
     * matched by the result selector into a {@link Result}.
     *
     * @param query search text; it is URL-encoded before being placed in the URL
     * @return the parsed results, with elements that failed to parse left out;
     *         an empty list if no result selector is set or the page could not
     *         be fetched
     */
    @Override
    protected List<Result> doSearch(String query) {
        if (resultSelector == null) {
            // Nothing to select with; skip the network round trip entirely.
            LOG.log(Level.WARNING, "No result_selector configured for {0}; returning no results", name);
            return List.of();
        }
        try {
            URL url = fillURL(query);
            Document doc = Jsoup.parse(url, FETCH_TIMEOUT_MILLIS);
            // Supplier form: the (potentially large) HTML dump is only built
            // when FINE logging is enabled.
            LOG.fine(() -> "Document from " + name + ":" + doc.html());
            Elements resultEles = doc.select(resultSelector);
            return resultEles.stream()
                    .map(this::parseResultElement)
                    .filter(Objects::nonNull)
                    .collect(Collectors.toList());
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Search failed for " + name, ex);
            return List.of();
        }
    }

    /**
     * Builds the request URL by substituting the URL-encoded query for the
     * {@code $QUERY} placeholder in the search URL template.
     *
     * @param query search text to encode and insert
     * @return the resulting URL
     * @throws MalformedURLException if no search URL template is set or the
     *         substituted string is not a valid URL
     */
    protected URL fillURL(String query) throws MalformedURLException {
        if (searchUrl == null) {
            throw new MalformedURLException("No search_url configured");
        }
        String encodedQuery = URLEncoder.encode(query, QUERY_CHARSET);
        // Literal replace: neither the placeholder nor the replacement is
        // interpreted as a regular expression.
        return new URL(searchUrl.replace(QUERY_PLACEHOLDER, encodedQuery));
    }

    /**
     * Extracts a single {@link Result} from one matched result element.
     *
     * <p>Each configured field selector is applied to {@code ele}; fields whose
     * selector is unset or matches nothing are left untouched on the result.
     *
     * @param ele element matched by the result selector
     * @return the populated result, or {@code null} if extraction failed
     *         (the failure is logged)
     */
    protected Result parseResultElement(Element ele) {
        try {
            Result res = new Result();

            String artist = firstText(ele, resultArtistSelector);
            if (artist != null) {
                res.setArtist(artist);
            }

            String title = firstText(ele, resultTitleSelector);
            if (title != null) {
                res.setTitle(title);
            }

            if (resultLinkSelector != null) {
                Element linkEle = ele.selectFirst(resultLinkSelector);
                if (linkEle != null) {
                    // Prefer the href attribute; fall back to the element text.
                    String link = linkEle.hasAttr("href") ? linkEle.attr("href") : linkEle.text();
                    // Resolving against rootURL handles both cases: an absolute
                    // link ignores the context, a relative one is resolved
                    // against the site root.
                    res.setLink(new URL(rootURL, link));
                }
            }

            // TODO: combined "artist + album" (resultAlbumArtistSelector) and
            // "artist + title" fields are not parsed yet.
            return res;
        } catch (Exception ex) {
            // Deliberately broad: one bad element (malformed link, selector
            // failure) must not abort the whole result list.
            LOG.log(Level.SEVERE, "Failed to parse result element from " + name, ex);
            return null;
        }
    }

    /**
     * Returns the text of the first descendant of {@code ele} matching
     * {@code selector}, or {@code null} if the selector is unset or has no match.
     */
    private static String firstText(Element ele, String selector) {
        if (selector == null) {
            return null;
        }
        Element match = ele.selectFirst(selector);
        return match == null ? null : match.text();
    }
}