日期:2014-05-20  浏览次数:20854 次

大家看看我抓取当当网产品的正则表达式哪里不对
我这段代码用来抓取当当网产品的产品名、图片和价格。
我的正则每次只能匹配产品名、图片、价格中的一个,我想一次把三个全部匹配出来,求指点。
Java code
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small scraper that downloads a product page and prints the product name,
 * picture URL and price found in it.
 *
 * <p>Each field is extracted by its own regular expression; capture group 1
 * of every match is printed on its own line.
 */
public class test {
    /** Product name: the text between {@code <h1>} and {@code </h1>}. */
    private static final Pattern NAME_PATTERN = Pattern.compile("<h1>(.*?)</h1>");

    /** Product picture: a {@code src} attribute whose value ends in {@code b.jpg}. */
    private static final Pattern PICTURE_PATTERN = Pattern.compile("src=\"(.*?b\\.jpg)\"");

    /** Price: the first decimal number following {@code class="num"} on the same line. */
    private static final Pattern PRICE_PATTERN = Pattern.compile("class=\"num\".*?(\\d+\\.\\d+)");

    public static void main(String[] args) {
        String url = "http://product.dangdang.com/product.aspx?product_id=20689512";
        new test().spiderProduct(url);
    }

    /**
     * Downloads the page at {@code url} (decoded as gb2312) and prints every
     * match of the name, picture and price patterns, in that order.
     *
     * <p>Previously the pattern string was overwritten twice before being
     * compiled, so only the price pattern was ever applied. All three patterns
     * are now applied one after another.
     *
     * @param url address of the product page; if it is null or blank nothing
     *            is printed
     */
    public void spiderProduct(String url) {
        String content = getURLContent(url, "gb2312");
        if (content == null) {
            // getURLContent returns null for a null/blank URL; Pattern.matcher(null)
            // would throw a NullPointerException.
            return;
        }

        Pattern[] patterns = { NAME_PATTERN, PICTURE_PATTERN, PRICE_PATTERN };
        for (Pattern pattern : patterns) {
            Matcher matcher = pattern.matcher(content);
            while (matcher.find()) {
                System.out.println(matcher.group(1));
            }
        }
    }

    /**
     * Reads the whole resource behind {@code url} into a string.
     *
     * @param url      address to read
     * @param encoding charset name used to decode the bytes; {@code null}
     *                 means "gb2312"
     * @return {@code null} if {@code url} is null or blank; otherwise the text
     *         read so far (empty, or partial, if an I/O error occurred — the
     *         error is printed to {@code System.err})
     */
    public String getURLContent(String url, String encoding) {
        if (url == null || "".equals(url.trim())) {
            return null;
        }

        StringBuilder content = new StringBuilder();
        InputStream in = null;
        try {
            // Create the URL object and open the connection.
            URL u = new URL(url);
            in = new BufferedInputStream(u.openStream());
            InputStreamReader theHTML = new InputStreamReader(in,
                    encoding != null ? encoding : "gb2312");
            char[] buffer = new char[4096];
            int count;
            while ((count = theHTML.read(buffer)) != -1) {
                content.append(buffer, 0, count);
            }
        }
        // Report failures and fall through with whatever was read.
        catch (MalformedURLException e) {
            System.err.println(e);
        } catch (IOException e) {
            System.err.println(e);
        } finally {
            // Always release the connection, even when decoding or reading fails.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close itself fails.
                }
            }
        }
        return content.toString();
    }

}



------解决方案--------------------
Java code

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A product scraped from a product page: its name, picture URL and price,
 * all kept as the raw strings captured from the HTML.
 *
 * <p>Two factories are provided: {@link #createItem(String)} extracts each
 * field with its own regular expression, {@link #getItem(String)} extracts all
 * three with a single combined expression.
 */
class ProductItem {
    /** Charset used to decode the page when none is given. */
    private static final String DEFAULT_ENCODING = "gb2312";

    /** Product name: the text between {@code <h1>} and {@code </h1>}. */
    private static final Pattern NAME_PATTERN = Pattern.compile("<h1>(.*?)</h1>");

    /** Product picture: a {@code src} attribute whose value ends in {@code b.jpg}. */
    private static final Pattern PICTURE_PATTERN = Pattern.compile("src=\"(.*?b\\.jpg)\"");

    /** Price: the first decimal number following {@code class="num"} on the same line. */
    private static final Pattern PRICE_PATTERN = Pattern.compile("class=\"num\".*?(\\d+\\.\\d+)");

    /**
     * Name, picture and price in one expression (groups 1, 2 and 3). DOTALL
     * lets {@code .} cross line breaks, since the three parts sit on different
     * lines of the page; the trailing {@code .*} consumes the rest of the
     * input, so the expression matches at most once.
     */
    private static final Pattern COMBINED_PATTERN = Pattern.compile(
            "<h1>(.*?)</h1>.*?src=\"(.*?b\\.jpg)\".*?num\".*?(\\d+\\.\\d+).*",
            Pattern.DOTALL);

    String name;
    String picture;
    String price;

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getPicture() {
        return picture;
    }

    public void setPicture(String picture) {
        this.picture = picture;
    }

    public ProductItem(String name, String picture, String price) {
        super();
        this.name = name;
        this.picture = picture;
        this.price = price;
    }

    /**
     * Builds an item by applying the name, picture and price patterns
     * separately and taking the first match of each.
     *
     * @param urlString address of the product page
     * @return the scraped item; a field is {@code null} when its pattern did
     *         not match, and all fields are {@code null} when
     *         {@code urlString} is null or blank
     */
    public static ProductItem createItem(String urlString) {
        String content = getURLContent(urlString, DEFAULT_ENCODING);
        if (content == null) {
            // Null/blank URL: Pattern.matcher(null) would throw a NullPointerException.
            return new ProductItem(null, null, null);
        }
        return new ProductItem(
                firstGroup(NAME_PATTERN, content),
                firstGroup(PICTURE_PATTERN, content),
                firstGroup(PRICE_PATTERN, content));
    }

    /**
     * Builds an item by matching name, picture and price with one combined
     * regular expression. The three parts must appear in that order in the
     * page for the expression to match.
     *
     * @param urlString address of the product page
     * @return the scraped item; all fields are {@code null} when the combined
     *         pattern did not match or {@code urlString} is null or blank
     */
    public static ProductItem getItem(String urlString) {
        String name = null, picture = null, price = null;
        String content = getURLContent(urlString, DEFAULT_ENCODING);
        if (content != null) {
            Matcher matcher = COMBINED_PATTERN.matcher(content);
            // The pattern swallows everything after its first match, so a
            // single find() is all that can ever succeed.
            if (matcher.find()) {
                name = matcher.group(1);
                picture = matcher.group(2);
                price = matcher.group(3);
            }
        }
        return new ProductItem(name, picture, price);
    }

    /** Returns group 1 of the first match of {@code pattern}, or null if there is none. */
    private static String firstGroup(Pattern pattern, String content) {
        Matcher matcher = pattern.matcher(content);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Reads the whole resource behind {@code urlString} into a string.
     *
     * @param urlString address to read
     * @param encoding  charset name used to decode the bytes; {@code null}
     *                  means "gb2312"
     * @return {@code null} if {@code urlString} is null or blank; otherwise the
     *         text read so far (empty, or partial, if an I/O error occurred —
     *         the error is printed to {@code System.err})
     */
    public static String getURLContent(String urlString, String encoding) {
        if (urlString == null || "".equals(urlString.trim())) {
            return null;
        }

        StringBuilder content = new StringBuilder();
        InputStream in = null;
        try {
            // Create the URL object and open the connection.
            URL url = new URL(urlString);
            in = new BufferedInputStream(url.openStream());
            InputStreamReader theHTML = new InputStreamReader(in,
                    encoding != null ? encoding : DEFAULT_ENCODING);
            char[] buffer = new char[4096];
            int count;
            while ((count = theHTML.read(buffer)) != -1) {
                content.append(buffer, 0, count);
            }
        }
        // Report failures and fall through with whatever was read.
        catch (MalformedURLException e) {
            System.err.println(e);
        } catch (IOException e) {
            System.err.println(e);
        } finally {
            // Always release the connection, even when decoding or reading fails.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close itself fails.
                }
            }
        }
        return content.toString();
    }

    /** Renders the three fields on separate lines, labelled name / picture / price. */
    @Override
    public String toString() {
        return "name = " + name + "  \npicture = " + picture + "  \nprice = "
                + price;
    }

}

/**
 * Demo entry point: scrapes one product page twice, once per extraction
 * strategy offered by {@code ProductItem}, and prints both results.
 */
public class dsfdsf {
    public static void main(String[] args) {
        final String productUrl = "http://product.dangdang.com/product.aspx?product_id=20689512";

        // First pass: one regular expression per field.
        System.out.println(ProductItem.createItem(productUrl));
        // Second pass: a single combined regular expression.
        System.out.println(ProductItem.getItem(productUrl));
    }

}