一、网络爬取
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demo 1 (network crawl): downloads a web page line by line and prints every
 * substring of each line that matches a regular expression.
 */
public class Main {
    /** Address of the page whose text is scanned. */
    private static final String PAGE_URL =
            "https://mbd.baidu.com/newspage/data/landingsuper?context=%7B%22nid%22%3A%22news_9556566747732783179%22%7D&n_type=-1&p_from=-1";

    /**
     * Regular expression applied to every line of the page.
     * It is left empty as a placeholder: fill in the pattern to extract.
     * Note that an empty pattern matches the empty string at every position,
     * so with this value the program prints only empty lines.
     */
    private static final String REGEX = "";

    /**
     * Opens the page, reads it line by line and prints each regex match on its own line.
     *
     * @param args unused
     * @throws IOException if the URL is malformed, the connection cannot be opened,
     *                     or reading from it fails
     * @throws CloneNotSupportedException never thrown by this body; kept so the
     *                     declared signature stays unchanged
     */
    public static void main(String[] args) throws CloneNotSupportedException, IOException {
        // Create the URL object and open a connection to it.
        URL url = new URL(PAGE_URL);
        URLConnection conn = url.openConnection();

        // Compile the pattern once; it is reused for every line.
        Pattern pattern = Pattern.compile(REGEX);

        // try-with-resources closes the reader (and the underlying stream) even when
        // readLine() throws. The charset is given explicitly so decoding does not depend
        // on the platform default.
        // NOTE(review): assumes the page is served as UTF-8 - confirm for other URLs.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Scan the current line and print every match found in it.
                Matcher matcher = pattern.matcher(line);
                while (matcher.find()) {
                    System.out.println(matcher.group());
                }
            }
        }
    }
}
二、本地爬取
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demo 2 (local extraction): pulls contact details - mobile numbers, e-mail
 * addresses, landline numbers and 400 hotline numbers - out of a hard-coded
 * sample text and prints each match on its own line.
 */
public class Main {
    /** Mobile number: a '1', a digit 3-9, then nine more digits. */
    private static final String MOBILE = "1[3-9]\\d{9}";

    /**
     * E-mail: word characters, '@', 2-6 word characters excluding '_',
     * then one or two dot-suffixes of 2-3 letters each.
     */
    private static final String EMAIL = "\\w+@[\\w&&[^_]]{2,6}(\\.[a-zA-Z]{2,3}){1,2}";

    /** Landline: '0' plus 2-3 digits, an optional '-', then a non-zero digit and 4-9 more digits. */
    private static final String LANDLINE = "0\\d{2,3}-?[1-9]\\d{4,9}";

    /** Hotline: "400", then a 3-digit and a 4-digit group (each starting non-zero), '-' separators optional. */
    private static final String HOTLINE = "400-?[1-9]\\d{2}-?[1-9]\\d{3}";

    /**
     * Combined pattern, assembled from the four parts above so that each
     * sub-pattern is written only once. Alternatives are tried left to right
     * at each position: mobile, e-mail, landline, hotline.
     */
    private static final Pattern CONTACT = Pattern.compile(
            "(" + MOBILE + ")|(" + EMAIL + ")|(" + LANDLINE + ")|(" + HOTLINE + ")");

    /**
     * Scans the sample text and prints every contact detail found, in order of appearance.
     *
     * @param args unused
     * @throws CloneNotSupportedException never thrown by this body; kept so the
     *                     declared signature stays unchanged
     * @throws IOException never thrown by this body; kept so the declared
     *                     signature stays unchanged
     */
    public static void main(String[] args) throws CloneNotSupportedException, IOException {
        // Sample text containing phone numbers, e-mail addresses, landlines and hotlines.
        String s = "来黑马程序员学习Java,"
                + "电话:18512516758,18512508907"
                + "或者联系邮箱:boniu@itcast.cn,"
                + "座机电话:01036517895,010-98951256"
                + "邮箱:bozai@itcast.cn, 热线电话:400-618-9090,400-618-4000,4006184000,4006189090";

        Matcher m = CONTACT.matcher(s);
        while (m.find()) {
            System.out.println(m.group());
        }
    }
}