java jsoup 网络爬虫 学习例子(四)抓取网页连接插入mysql数据库_JAVA_编程开发_程序员俱乐部

中国优秀的程序员网站程序员频道CXYCLUB技术地图
热搜:
更多>>
 
您所在的位置: 程序员俱乐部 > 编程开发 > JAVA > java jsoup 网络爬虫 学习例子(四)抓取网页连接插入mysql数据库

java jsoup 网络爬虫 学习例子(四)抓取网页连接插入mysql数据库

 2016/5/12 5:33:50  InJavaWeTrust  程序员俱乐部  我要评论(0)

java jsoup 网络爬虫 学习例子(四) 抓取网页链接插入mysql数据库

?

package com.iteye.injavawetrust.jsoup;

import java.io.IOException;
import java.util.Iterator;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches one web page and writes every link-like reference found on it to
 * the database through {@link DBUtil#insert(String)}.
 *
 * <p>Four kinds of elements are harvested, in this order: {@code a[href]},
 * {@code img[src]}, {@code option[value]} and {@code link[href]}.
 *
 * @author InJavaWeTrust
 *
 */
public class GetLink {

	private final JsoupUtil ju = JsoupUtil.getInstance();

	private final DBUtil du = DBUtil.getInstance();

	/**
	 * Downloads {@code url} and stores one row per matching element.
	 *
	 * <p>A failure to fetch or parse the page is printed to standard error and
	 * otherwise ignored, so the method never throws {@link IOException}.
	 *
	 * @param url address of the page to crawl
	 */
	public void getLink(String url) {
		try {
			// 5000 is the value handed to jsoup's timeout setting.
			Document document = Jsoup.connect(url).timeout(5000).get();

			// Anchors: visible text as the name, href as the target.
			for (Element anchor : document.select("a[href]")) {
				store(anchor.text(), anchor.attr("href"));
			}
			// Images: alt text as the name, src as the target.
			for (Element image : document.select("img[src]")) {
				store(image.attr("alt"), image.attr("src"));
			}
			// Drop-down options: option text as the name, value as the target.
			for (Element option : document.select("option[value]")) {
				store(option.text(), option.attr("value"));
			}
			// <link> elements (stylesheets, icons, ...): href as the target.
			for (Element resource : document.select("link[href]")) {
				store(resource.text(), resource.attr("href"));
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Builds a fresh {@link Link} with a new id and inserts it.
	 *
	 * <p>A new object is created per row instead of reusing an instance field,
	 * so no state is carried over between rows or between calls.
	 *
	 * @param name display name of the reference
	 * @param target the URL (or option value) the reference points to
	 */
	private void store(String name, String target) {
		Link row = new Link();
		row.setId(ju.getUUID());
		row.setUrlName(name);
		row.setUrl(target);
		du.insert(ju.getInsertSql(row));
	}

	/**
	 * Crawls the page configured in {@code Constants.URL}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		new GetLink().getLink(Constants.URL);
	}

}


package com.iteye.injavawetrust.jsoup;

import java.io.Serializable;
import java.util.Date;

/**
 * Plain data holder for one harvested link: an id, a display name, the target
 * URL and the time the row was inserted.
 *
 * @author InJavaWeTrust
 *
 */
public class Link implements Serializable{

	private static final long serialVersionUID = 1165098694307553167L;
	/**
	 * ID
	 */
	private String id;
	/**
	 * link name
	 */
	private String urlName;
	/**
	 * link url
	 */
	private String url;
	/**
	 * insert db date
	 */
	private Date date;

	/**
	 * @return the id, or {@code null} if none has been set
	 */
	public String getId() {
		return id;
	}

	/**
	 * @param id the id to store
	 */
	public void setId(String id) {
		this.id = id;
	}

	/**
	 * @return the link's display name, or {@code null} if none has been set
	 */
	public String getUrlName() {
		return urlName;
	}

	/**
	 * @param urlName the display name to store
	 */
	public void setUrlName(String urlName) {
		this.urlName = urlName;
	}

	/**
	 * @return the link's target URL, or {@code null} if none has been set
	 */
	public String getUrl() {
		return url;
	}

	/**
	 * @param url the target URL to store
	 */
	public void setUrl(String url) {
		this.url = url;
	}

	/**
	 * Returns a copy of the stored date. {@link Date} is mutable, so handing
	 * out the internal instance would let callers change this object's state
	 * behind its back.
	 *
	 * @return a copy of the date, or {@code null} if none has been set
	 */
	public Date getDate() {
		return date == null ? null : new Date(date.getTime());
	}

	/**
	 * Stores a copy of {@code date}, so later changes to the caller's
	 * instance do not affect this object.
	 *
	 * @param date the date to store; may be {@code null}
	 */
	public void setDate(Date date) {
		this.date = date == null ? null : new Date(date.getTime());
	}
	
}


package com.iteye.injavawetrust.jsoup;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Singleton helper around JDBC: opens connections using the settings in
 * {@code Constants}, closes JDBC resources, and runs insert statements.
 *
 * <p>The class keeps no connection state; every {@link #insert(String)} call
 * opens its own connection and closes it again before returning.
 *
 * @author InJavaWeTrust
 *
 */
public class DBUtil {

	private static final DBUtil instance = new DBUtil();

	private DBUtil() {

	}

	/**
	 * @return the single shared instance
	 */
	public static DBUtil getInstance() {
		return instance;
	}

	/**
	 * Opens a new database connection.
	 *
	 * <p>Failures are printed to standard error rather than thrown. If the
	 * driver class cannot be loaded, the connection attempt is still made.
	 *
	 * @return a new open connection, or {@code null} if it could not be
	 *         established
	 */
	public Connection connection() {
		try {
			Class.forName(Constants.DRIVER);
		} catch (ClassNotFoundException e1) {
			e1.printStackTrace();
		}
		try {
			return DriverManager.getConnection(Constants.DBURL, Constants.USER,
					Constants.PASSWORD);
		} catch (SQLException e) {
			e.printStackTrace();
			// Report failure explicitly instead of returning an older,
			// already closed connection.
			return null;
		}
	}

	/**
	 * Closes the given resources in the order result set, statement,
	 * connection. Each argument may be {@code null}. Closing is best-effort:
	 * a failure to close one resource is ignored so the remaining ones are
	 * still closed.
	 *
	 * @param rs result set to close, or {@code null}
	 * @param st statement to close, or {@code null}
	 * @param conn connection to close, or {@code null}
	 */
	public void release(ResultSet rs, Statement st, Connection conn) {
		try {
			if (null != rs) {
				rs.close();
			}
		} catch (Exception ignored) {
			// best-effort cleanup; nothing useful can be done here
		}
		try {
			if (null != st) {
				st.close();
			}
		} catch (Exception ignored) {
			// best-effort cleanup; nothing useful can be done here
		}
		try {
			if (null != conn) {
				conn.close();
			}
		} catch (Exception ignored) {
			// best-effort cleanup; nothing useful can be done here
		}
	}
	
	/**
	 * Executes one SQL statement on a fresh connection.
	 *
	 * <p>The connection and statement are always closed, including when
	 * execution fails. Errors are printed to standard error and not
	 * propagated, so this method never throws to its caller.
	 *
	 * @param sql the complete SQL statement to execute
	 */
	public void insert(String sql){
		Connection connection = null;
		Statement statement = null;
		try{
			connection = connection();
			if (null == connection) {
				// connection() has already reported why it failed.
				return;
			}
			statement = connection.createStatement();
			statement.execute(sql);
		}catch(Exception e){
			// Broad catch kept on purpose: callers rely on insert never throwing.
			e.printStackTrace();
		}finally{
			release(null, statement, connection);
		}
	}

}


package com.iteye.injavawetrust.jsoup;

/**
 * Configuration constants for the crawler example: JDBC settings and the
 * page to crawl.
 *
 * @author InJavaWeTrust
 *
 */
public class Constants {
	
	/**
	 * MySQL JDBC driver class name.
	 */
	public static final String DRIVER   = "com.mysql.jdbc.Driver";
	/**
	 * JDBC connection URL. Query parameters are separated by a plain
	 * {@code &}; the HTML entity {@code &amp;amp;} must not appear here
	 * because the driver would read it as part of the parameter name.
	 */
	public static final String DBURL    = "jdbc:mysql://localhost:3306/jsoupdb?useUnicode=true&characterEncoding=utf-8";
	/**
	 * Database user name.
	 */
	public static final String USER     = "root";
	/**
	 * Database password.
	 */
	public static final String PASSWORD = "root";
	/**
	 * An arbitrarily chosen URL used as the page to crawl.
	 */
	public static final String URL      = "http://www.hrbhuade.net/html/main/index.htm";

}


package com.iteye.injavawetrust.jsoup;

import java.util.UUID;

/**
 * Singleton with small helpers for the crawler: id generation and building
 * the insert statement for a {@link Link}.
 *
 * @author InJavaWeTrust
 *
 */
public class JsoupUtil {

	private static final JsoupUtil instance = new JsoupUtil();

	private JsoupUtil() {

	}

	/**
	 * @return the single shared instance
	 */
	public static JsoupUtil getInstance() {
		return instance;
	}
	
	/**
	 * Generates a random UUID and strips its four hyphens.
	 *
	 * @return the 32-character UUID string without hyphens
	 */
	public String getUUID() {
		return UUID.randomUUID().toString().replace("-", "");
	}
	
	/**
	 * Builds the SQL statement that inserts {@code link} into the
	 * {@code link} table, using {@code NOW()} for the date column.
	 *
	 * <p>The values come from scraped pages, so they are escaped before being
	 * embedded as string literals; otherwise a single apostrophe in a link
	 * name would break the statement or allow SQL injection.
	 *
	 * <p>NOTE(review): a {@code PreparedStatement} with {@code ?} placeholders
	 * would be the more robust fix, but needs a different {@code DBUtil} API.
	 *
	 * @param link the link to insert; must not be {@code null}
	 * @return the complete insert statement
	 */
	public String getInsertSql(Link link) {
		return "insert into link (id, urlname, url, date) values ("
				+ quote(link.getId()) + "," + quote(link.getUrlName()) + ","
				+ quote(link.getUrl()) + ",NOW())";
	}

	/**
	 * Renders a value as a SQL string literal.
	 *
	 * @param value the raw value, possibly {@code null}
	 * @return {@code NULL} for a {@code null} value, otherwise the value in
	 *         single quotes with backslashes and single quotes escaped
	 */
	private static String quote(String value) {
		if (value == null) {
			// Store a real SQL NULL rather than the four-letter text 'null'.
			return "NULL";
		}
		// Backslashes are doubled first so the quote escaping added next is
		// not itself re-escaped.
		String escaped = value.replace("\\", "\\\\").replace("'", "''");
		return "'" + escaped + "'";
	}

}

?

link 表

?

-- Recreates the `link` table from scratch.
-- The DROP removes any existing `link` table (and its rows) first.
DROP TABLE IF EXISTS `link`;
-- Columns: `id` is the primary key (up to 32 characters); `urlname` and `url`
-- hold up to 200 characters each; `date` is a nullable datetime.
CREATE TABLE `link` (
  `id` varchar(32) NOT NULL,
  `urlname` varchar(200) DEFAULT NULL,
  `url` varchar(200) DEFAULT NULL,
  `date` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

?

?

?

发表评论
用户名: 匿名