目标网站,仅作为实验目的。
①爬取姓氏
网站: https://hanyu.baidu.com/shici/detail?from=aladdin&pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&showPinyin=1
②爬取男生名字
网站:https://baijiahao.baidu.com/s?id=1744863812577130101&wfr=spider&for=pc
③爬取女生名字
网站:https://baijiahao.baidu.com/s?id=1743833274577209720&wfr=spider&for=pc
1.爬虫函数(使用转换流,输入输出流)
/**
 * Downloads the resource at the given address and returns its content as one string.
 *
 * <p>The response bytes are always decoded as UTF-8 instead of the platform default
 * charset, so non-ASCII text is read the same way on every machine. The charset
 * announced by the server is not inspected.
 *
 * @param net address of the resource to fetch
 * @return the complete decoded content; an empty string if the resource is empty
 * @throws IOException if the address is malformed, the connection cannot be opened,
 *     or reading fails
 */
public static String webCrawler(String net) throws IOException {
    StringBuilder sb = new StringBuilder();
    URL url = new URL(net);
    URLConnection conn = url.openConnection();
    // try-with-resources closes the stream even when a read fails part-way through.
    try (InputStreamReader isr = new InputStreamReader(conn.getInputStream(), "UTF-8")) {
        // Read in chunks rather than one character per call.
        char[] buffer = new char[8192];
        int read;
        while ((read = isr.read(buffer)) != -1) {
            sb.append(buffer, 0, read);
        }
    }
    return sb.toString();
}
}
2.数据筛选函数(正则表达式)
/**
 * Collects one capture group from every match of a regular expression in a text.
 *
 * @param str   the text to search
 * @param rule  the regular expression to apply
 * @param index number of the capture group to take from each match
 * @return the captured strings, in the order the matches occur in the text
 */
private static ArrayList<String> getData(String str, String rule, int index) {
    ArrayList<String> captured = new ArrayList<>();
    Matcher hits = Pattern.compile(rule).matcher(str);
    // Walk the matches from left to right, keeping only the requested group.
    while (hits.find()) {
        captured.add(hits.group(index));
    }
    return captured;
}
3.主函数(main)
public class Test1 {
/**
 * Entry point: downloads the three source pages and prints the four-character groups
 * extracted from the surname page.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if any of the pages cannot be downloaded
 */
public static void main(String[] args) throws IOException {
    // Addresses of the pages to crawl: surnames, boys' names, girls' names.
    final String surnamePage = "https://hanyu.baidu.com/shici/detail?from=aladdin&pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&showPinyin=1";
    final String boyPage = "https://baijiahao.baidu.com/s?id=1744863812577130101&wfr=spider&for=pc";
    final String girlPage = "https://baijiahao.baidu.com/s?id=1743833274577209720&wfr=spider&for=pc";

    // Download each page in full as a single string.
    String family = webCrawler(surnamePage);
    String boy = webCrawler(boyPage);
    String girl = webCrawler(girlPage);

    // Capture every run of four characters that is directly followed by ',' or '。'.
    String surnameRule = "(.{4})(,|。)";
    ArrayList<String> familyNameTemp = getData(family, surnameRule, 1);
    System.out.println(familyNameTemp);
}
使用集合(ArrayList)存储
使用正则表达式匹配汉字
// Capture every pair of Chinese characters that is directly followed by '、' or '。'.
String boyNameRule = "([\\u4E00-\\u9FA5]{2})(、|。)";
ArrayList<String> boyNameTemp = getData(boy, boyNameRule, 1);
System.out.println(boyNameTemp);
效果:
// Capture every pair of Chinese characters that is directly followed by '、' or '。'.
String girlNameRule = "([\\u4E00-\\u9FA5]{2})(、|。)";
ArrayList<String> girlNameTemp = getData(girl, girlNameRule, 1);
System.out.println(girlNameTemp);
效果:
// Boys' names: drop duplicates while keeping the order of first occurrence.
// A HashSet remembers what has already been seen, so each membership check is
// constant-time instead of a scan over the result list.
ArrayList<String> boyList = new ArrayList<>();
HashSet<String> seenBoyNames = new HashSet<>();
for (String str : boyNameTemp) {
    // add() returns false when the name was already present.
    if (seenBoyNames.add(str)) {
        boyList.add(str);
    }
}
System.out.println(boyList);
// Girls' names: the same order-preserving de-duplication.
ArrayList<String> girlList = new ArrayList<>();
HashSet<String> seenGirlNames = new HashSet<>();
for (String str : girlNameTemp) {
    if (seenGirlNames.add(str)) {
        girlList.add(str);
    }
}
System.out.println(girlList);
拼接成指定集合元素的格式:“张三-性别-年龄”
/**
 * Builds a roster of randomly generated people, each formatted as
 * {@code name-gender-age}, for example {@code 张三-男-23}.
 *
 * <p>Every full name is a randomly chosen surname followed by a randomly chosen given
 * name. Names are unique among the boys and unique among the girls. Boys get a random
 * age from 18 to 27 and girls from 18 to 25, both inclusive. The argument lists are
 * only read; their element order is left untouched.
 *
 * @param familyList surnames to draw from
 * @param boyList    given names for boys
 * @param girlList   given names for girls
 * @param boyCnt     number of boys to generate
 * @param girlCnt    number of girls to generate
 * @return all boys followed by all girls, each group in no particular order
 * @throws IllegalArgumentException if a count is negative, or larger than the number of
 *     distinct full names the corresponding lists can produce
 */
public static ArrayList<String> getInfos(ArrayList<String> familyList, ArrayList<String> boyList, ArrayList<String> girlList, int boyCnt, int girlCnt) {
    Random random = new Random();
    HashSet<String> boyhs = pickUniqueNames(familyList, boyList, boyCnt, random);
    HashSet<String> girlhs = pickUniqueNames(familyList, girlList, girlCnt, random);

    ArrayList<String> list = new ArrayList<>(boyhs.size() + girlhs.size());
    for (String boyName : boyhs) {
        int age = random.nextInt(10) + 18; // 18..27
        list.add(boyName + "-男-" + age);
    }
    for (String girlName : girlhs) {
        int age = random.nextInt(8) + 18; // 18..25
        list.add(girlName + "-女-" + age);
    }
    return list;
}

/** Draws {@code count} distinct "surname + given name" strings at random without modifying either list. */
private static HashSet<String> pickUniqueNames(ArrayList<String> familyList, ArrayList<String> givenList, int count, Random random) {
    if (count < 0) {
        throw new IllegalArgumentException("count must not be negative: " + count);
    }
    HashSet<String> names = new HashSet<>();
    if (count == 0) {
        return names;
    }
    // Cheap upper bound on the number of distinct names; also rejects empty lists.
    if ((long) familyList.size() * givenList.size() < count) {
        throw new IllegalArgumentException("cannot build " + count + " distinct names from "
                + familyList.size() + " surnames and " + givenList.size() + " given names");
    }

    // Pick random pairs and let the set reject repeats. The attempt budget guarantees
    // the loop ends even when almost every possible name has already been drawn.
    long attemptsLeft = 20L * count + 100;
    while (names.size() < count && attemptsLeft-- > 0) {
        String family = familyList.get(random.nextInt(familyList.size()));
        String given = givenList.get(random.nextInt(givenList.size()));
        names.add(family + given);
    }
    if (names.size() == count) {
        return names;
    }

    // Budget exhausted: enumerate the names not drawn yet and take the rest from them.
    // Duplicated entries or colliding concatenations can leave fewer names than the
    // bound above suggested, so the exact check happens here.
    HashSet<String> unused = new HashSet<>();
    for (String family : familyList) {
        for (String given : givenList) {
            unused.add(family + given);
        }
    }
    unused.removeAll(names);
    int missing = count - names.size();
    if (unused.size() < missing) {
        throw new IllegalArgumentException("only " + (names.size() + unused.size())
                + " distinct names are available, but " + count + " were requested");
    }
    ArrayList<String> pool = new ArrayList<>(unused);
    Collections.shuffle(pool, random);
    names.addAll(pool.subList(0, missing));
    return names;
}
主函数添加代码:
// Generate ten boys and ten girls, then randomise the order of the combined list.
// NOTE(review): familyList is not defined in the snippets shown here; presumably it is
// the list of surnames derived from familyNameTemp — confirm.
int boyCount = 10;
int girlCount = 10;
ArrayList<String> infos = getInfos(familyList, boyList, girlList, boyCount, girlCount);
Collections.shuffle(infos);
System.out.println(infos);
效果:
// Write one record per line. try-with-resources closes (and thereby flushes) the
// writer even if a write fails, so the file handle is never leaked.
try (BufferedWriter bw = new BufferedWriter(new FileWriter("G:\\JavaReview\\day33\\names.txt"))) {
    for (String info : infos) {
        bw.write(info);
        bw.newLine();
    }
}
查看效果: