Crawling website data with Java
Below is a Java crawler code fragment that Programming House (jb51.cc) collected and arranged from the web; the editor shares it here for your reference.
package com.zzger.model;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;

import com.zzger.module.queue.UrlQueue;
import com.zzger.util.HttpUtils;
import com.zzger.util.RegexUtils;

// Note: Page, Gxpage, Section and DuanZi are project-specific classes that are not
// included in this fragment, so the class will not compile without them.
public class WebSite {

    /** Site url */
    private String url;

    /** Queue of urls that still need to be crawled */
    private UrlQueue<String> urls = new UrlQueue<>();

    /** Urls of pages that have already been crawled (not yet used in this fragment) */
    private List<String> exitUrls = Collections.synchronizedList(new ArrayList<>());

    public WebSite(String url) {
        this.url = url;
        urls.offer(url); // put the site's home page into the crawl queue
    }

    /** Breadth-first crawl: fetch the home page and start following "next page" links. */
    public void guangDu() {
        new Thread(new Runnable() {
            @Override
            public void run() {
                paxing(HttpUtils.httpGet(url));
            }
        }).start();
    }

    /** Extract the "next page" link from the html and follow it recursively. */
    public void paxing(String html) {
        if (html.lastIndexOf("下一页</a></li></ul></div>") < 0)
            return;
        String strList = html.substring(html.indexOf("<li class=\"next-page\">"),
                html.lastIndexOf("下一页</a></li></ul></div>"));
        String url = RegexUtils.RegexString("<a href=\"(.+?)\"", strList);
        if (url.equals("Nothing"))
            return;
        urls.put(url); // store the url in the crawl queue
        paxing(HttpUtils.httpGet(url));
    }

    /** Take one url from the queue and parse its sections with a batch of worker threads. */
    public void dxcPx() {
        Page<DuanZi> page = new Gxpage(urls.take());
        List<Section<DuanZi>> list = page.ybhqSection().getSections();
        // Fresh latches per call: the original used one-shot member latches sized to a fixed
        // TOTAL_THREADS, which breaks when the section count differs or the method recurses.
        final CountDownLatch startSignal = new CountDownLatch(1);
        final CountDownLatch doneSignal = new CountDownLatch(list.size());
        for (final Section<DuanZi> section : list) {
            new Thread(new Runnable() {
                @Override
                public void run() {
                    try {
                        startSignal.await(); // block until all workers are released together
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                    DuanZi duanzi = section.select().getModel();
                    System.out.println(duanzi.getTitle());
                    doneSignal.countDown(); // one worker finished
                }
            }).start();
        }
        startSignal.countDown(); // all worker threads wait for this single start command
        try {
            doneSignal.await(); // wait for every worker in this batch to finish
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        dxcPx(); // once this batch is done, take the next url from the queue
    }

    public static void main(String[] args) {
        final WebSite web = new WebSite("http://duanziwang.com");
        web.guangDu();
        for (int i = 0; i < 10; i++) {
            new Thread(new Runnable() {
                @Override
                public void run() {
                    web.dxcPx();
                }
            }).start();
        }
    }
}
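The fragment above depends on several classes from the author's own project (HttpUtils, RegexUtils, UrlQueue, and the Page/Gxpage/Section/DuanZi model classes) that the article does not include. As a rough idea of what the three utility classes might look like, here is a minimal sketch based only on how WebSite calls them; the packages, signatures, and behaviour are assumptions, not the original project code. The model classes are too site-specific to guess at and are left out.

// ---- com/zzger/util/HttpUtils.java (hypothetical sketch) ----
package com.zzger.util;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class HttpUtils {
    // Fetch the body of a page as a UTF-8 string; returns "" if the request fails.
    public static String httpGet(String urlString) {
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(urlString).openConnection();
            conn.setConnectTimeout(5000);
            conn.setReadTimeout(5000);
            StringBuilder sb = new StringBuilder();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    sb.append(line).append('\n');
                }
            }
            return sb.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }
}

// ---- com/zzger/util/RegexUtils.java (hypothetical sketch) ----
package com.zzger.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexUtils {
    // Return the first capture group of the pattern, or "Nothing" when there is no match,
    // since WebSite.paxing checks for that sentinel value.
    public static String RegexString(String regex, String input) {
        Matcher m = Pattern.compile(regex).matcher(input);
        return m.find() ? m.group(1) : "Nothing";
    }
}

// ---- com/zzger/module/queue/UrlQueue.java (hypothetical sketch) ----
package com.zzger.module.queue;

import java.util.concurrent.LinkedBlockingQueue;

// Thin wrapper over a blocking queue, matching the offer/put/take calls in WebSite.
public class UrlQueue<T> {
    private final LinkedBlockingQueue<T> queue = new LinkedBlockingQueue<>();

    public void offer(T item) {
        queue.offer(item);
    }

    public void put(T item) {
        try {
            queue.put(item);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    public T take() {
        try {
            return queue.take();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return null;
        }
    }
}

With a blocking queue behind UrlQueue, take() simply waits when the queue is empty, which is what would let the ten worker threads started in main idle until guangDu has discovered new page urls.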
The above is the complete code collected by Programming House (jb51.cc). I hope this article helps you solve the problems you run into in program development.
If you find the content on Programming House useful, please recommend the site to your programmer friends.
This article was collected from the network and is intended as a learning reference; copyright remains with the original author.