python3.4
BeautifulSoup
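from urllib.request import urlopen  # used to open web pages
from urllib.error import HTTPError  # used to handle connection errors
from bs4 import BeautifulSoup  # used to parse HTML documents
import re  # used to match target strings with regular expressions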
import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.HttpURLConnection;import java.net.MalformedURLException;import java.net.URL;public class Capture { public static void main(String[] args) throws MalformedURLException, IOException { String strUrl ='http://news.baidu.com/'; URL url =new URL(strUrl); HttpURLConnection httpConnection = (HttpURLConnection) url.openConnection(); InputStreamReader input = new InputStreamReader(httpConnection.getInputStream(),'utf-8'); BufferedReader bufferedReader = new BufferedReader(input); String line =''; StringBuilder stringBuilder = new StringBuilder(); while ((line =bufferedReader.readLine())!=null){ stringBuilder.append(line); } String string =stringBuilder.toString(); int begin =string.indexOf('