package Github.lz632339942.io;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class fnpc {
public static void main(String[] args) throws Exception {
fnpc a = new fnpc();
a.getHtml();
}
@SuppressWarnings({ "rawtypes", "unchecked" })
public void getHtml() throws Exception{
ImgDown xz = new ImgDown();
HashSet pdcm = new HashSet();
Document doc = Jsoup.connect("https://photo.fengniao.com/f_595.html").get();
//获取元素节点等,观察爬取网页的HTML源代码
List<Element> infoListEle = doc.getElementsByAttributeValue("class", "noRight");
infoListEle.forEach(element -> {
System.out.println(element.getElementsByAttributeValue("class", "txtBox overOneTxt")
.first().getElementsByAttributeValue("class", "title").text());
String title = element.getElementsByAttributeValue("class", "txtBox overOneTxt").first()
.getElementsByAttributeValue("class", "title").text();
//这里判断名字是否重复,重复会在后面加一个6位的随机数,博客书写需要
if (!(pdcm.add(title)))title = title + "NO."+(int)(Math.random() * 1000000);
//去除名字中的“:”符号,博客书写需要
title = title.replace(":", "");
System.out.println(element.getElementsByAttributeValue("class", "txtBox overOneTxt")
.first().getElementsByAttributeValue("class", "peopleName overOneTxt").text());
String zz = element.getElementsByAttributeValue("class", "txtBox overOneTxt")
.first().getElementsByAttributeValue("class", "peopleName overOneTxt").text();
System.out.println(element.getElementsByAttributeValue("class", "pic").attr("style"));
String url= element.getElementsByAttributeValue("class", "pic").attr("style");
//由于获取的图片链接中包括background-image: url('')和一些后缀,前后格式固定,所以直接提取中间的url
url = url.substring(23, url.length()-43);
//将得到的名字作者以及图片url处理成博客需要的格式
//String Makedown = title+":\n full_link: "+url+"\n thumb_link: "+url+"\n descr: "+zz;
String Makedown = title+":\n full_link: /images/tk"+url.substring(url.lastIndexOf("/"))
+"\n thumb_link: /images/tk"+url.substring(url.lastIndexOf("/"))+"\n descr: "+zz;
//将内容写到文件里面
writeFile("D:\\HEXO\\HEXO\\source\\_data\\gallery.yml",Makedown);
//下载图片,另外写了一个类 ImgDown
xz.downImages("D:\\HEXO\\HEXO\\themes\\hexo-theme-Mic_Theme\\source\\images\\tk", url);
});
}
//此处作用是把处理后的字符串写入文件
public static void writeFile(String fileFullPath,String content) {
//刚开始FileOutputStream中文会乱码,所以换成OutputStreamWriter
//FileOutputStream fos = null;
OutputStreamWriter oStreamWriter = null;
try {
//true不覆盖已有内容
//fos = new FileOutputStream(fileFullPath, true);
oStreamWriter = new OutputStreamWriter(new FileOutputStream(fileFullPath, true), "utf-8");
//写入
//fos.write(content.getBytes());
oStreamWriter.append(content);
// 写入一个换行
//fos.write("\r\n\r\n".getBytes());
oStreamWriter.append("\r\n\r\n");
}catch (IOException e) {
e.printStackTrace();
}finally{
if(oStreamWriter != null){
try {
//fos.flush();
oStreamWriter.flush();
//fos.close();
oStreamWriter.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
package Github.lz632339942.io;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
public class ImgDown {
/**
* 根据图片的URL下载的图片到本地的filePath
* @param filePath 文件夹
* @param imageUrl 图片的网址
*/
public void downImages(String filePath,String imageUrl){
// 截取图片的名称
String fileName = imageUrl.substring(imageUrl.lastIndexOf("/"));
//创建文件的目录结构
File files = new File(filePath);
// 判断文件夹是否存在,如果不存在就创建一个文件夹
if(!files.exists()){
files.mkdirs();
}
try {
URL url = new URL(imageUrl);
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
InputStream is = connection.getInputStream();
// 创建文件
File file = new File(filePath+fileName);
FileOutputStream out = new FileOutputStream(file);
int i = 0;
while((i = is.read()) != -1){
out.write(i);
}
is.close();
out.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
package Github.lz632339942.io;
import java.util.List;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class JS_DT {
public static void main(String[] args) {
JS_DT Cs = new JS_DT();
Cs.test();
}
//这个是用来获取js动态网页内容的
public void test() {
//新建一个模拟谷歌Chrome浏览器的浏览器客户端对象
final WebClient webClient = new WebClient(BrowserVersion.CHROME);
//当JS执行出错的时候是否抛出异常, 这里选择不需要
webClient.getOptions().setThrowExceptionOnScriptError(false);
//当HTTP的状态非200时是否抛出异常, 这里选择不需要
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.getOptions().setActiveXNative(false);
//是否启用CSS, 因为不需要展现页面, 所以不需要启用
webClient.getOptions().setCssEnabled(false);
//很重要,启用JS
webClient.getOptions().setJavaScriptEnabled(true);
//很重要,设置支持AJAX
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
HtmlPage page = null;
try {
//尝试加载上面图片例子给出的网页
page = webClient.getPage("http://localhost:4000/gallery/");
} catch (Exception e) {
e.printStackTrace();
}finally {
webClient.close();
}
//异步JS执行需要耗时,所以这里线程要阻塞30秒,等待异步JS执行结束
webClient.waitForBackgroundJavaScript(30000);
//直接将加载完成的页面转换成xml格式的字符串
String pageXml = page.asXml();
//TODO 下面的代码就是对字符串的操作了,常规的爬虫操作,用到了比较好用的Jsoup库
//获取html文档
Document document = Jsoup.parse(pageXml);
System.out.println(document);
//获取元素节点等
List<Element> infoListEle = document.getElementsByAttributeValue("class", "photo_thumbnail jg-entry entry-visible");
infoListEle.forEach(element -> {
System.out.println(element.getElementsByTag("div").first().getElementsByTag("span").text());
System.out.println(element.getElementsByTag("a").first().getElementsByTag("img").attr("src"));
});
}
}