用 DOM 实现文章采集——通过类似 jQuery 选择器的语法采集指定对象的文本

来源:岁月联盟 编辑:exp 时间:2012-04-26

[csharp]
/// <summary>
/// DOM query helper whose selector syntax is similar to jQuery's.
/// </summary>
public class DomQuery 

    /// <summary> 
    /// 获得节点 
    /// </summary> 
    /// <param name="_HtmlDocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    /// <remarks>DOM选择器,用法跟jquery差不多</remarks> 
    public IList<HtmlNode> Get(HtmlDocument _HtmlDocument, string selector) 
    { 
        string[] Expressions = selector.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); 
 
        List<HtmlNode> hnList = new List<HtmlNode>(); 
 
        if (Expressions[0].StartsWith("#")) 
        { 
            hnList.Add(_HtmlDocument.GetElementbyId(Expressions[0].TrimStart('#'))); 
            hnList.RemoveAll(x => { return x == null; }); 
 
            if (Expressions.Length == 1) 
            { 
                return hnList; 
            } 
 
            for (int i = 1; i < Expressions.Length; i++) 
            { 
                hnList = Get(hnList, Expressions[i]); 
            } 
        } 
        else 
        { 
            hnList.AddRange(_HtmlDocument.DocumentNode.ChildNodes.Where(x => { return x.NodeType == HtmlNodeType.Element; })); 
 
            for (int i = 0; i < Expressions.Length; i++) 
            { 
                hnList = Get(hnList, Expressions[i]); 
            } 
        } 
 
 
 
 
 
        return hnList; 
    } 
    /// <summary> 
    /// 查找节点,并直接返回InnerHtml 
    /// </summary> 
    /// <param name="_HtmlDocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public string SingleGetInnerHtml(HtmlDocument _HtmlDocument, string selector) 
    { 
        HtmlNode hn = SingleGet(_HtmlDocument, selector); 
        if (hn == null) 
            return null; 
        else 
            return hn.InnerHtml; 
    } 
    /// <summary> 
    /// 查找节点,并直接返回InnerText 
    /// </summary> 
    /// <param name="_HtmlDocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public string SingleGetInnerText(HtmlDocument _HtmlDocument, string selector) 
    { 
        HtmlNode hn = SingleGet(_HtmlDocument, selector); 
        if (hn == null) 
            return null; 
        else 
            return hn.InnerText.Trim(); 
    } 
    /// <summary> 
    /// 查找节点 
    /// </summary> 
    /// <param name="_HtmlDocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public HtmlNode SingleGet(HtmlDocument _HtmlDocument, string selector) 
    { 
        IList<HtmlNode> hnList = Get(_HtmlDocument, selector); 
 
        if (hnList.Count == 0) 
        { 
            return null; 
        } 
        else 
        { 
            return hnList[0]; 
        } 
    } 
 
    #region 获得属性 
    /// <summary> 
    /// 获得属性 
    /// </summary> 
    /// <param name="_HtmlNodes"></param> 
    /// <param name="attr"></param> 
    /// <returns></returns> 
    public string[] Attr(IList<HtmlNode> _HtmlNodes, string attr) 
    { 
        if (_HtmlNodes == null) 
        { 
            return new string[0]; 
        } 
        if (_HtmlNodes.Count() == 0) 
        { 
            return new string[0]; 
        } 
        var v = from x in _HtmlNodes where x.Attributes[attr] != null select x; 
 
        return (from x in v select x.Attributes[attr].Value).ToArray(); 
    } 
    #endregion 
 
    #region 根据选择器语法查找 
    /// <summary> 
    /// 根据选择器语法查找 
    /// </summary> 
    /// <param name="_HtmlNodes"></param> 
    /// <param name="Expression"></param> 
    /// <returns></returns> 
    private List<HtmlNode> Get(List<HtmlNode> _HtmlNodes, string Expression) 
    { 
        string _expre = null; 
        string fun = null; 
        int index = -1; 
        string keyword = null; 
        Regex reg = new Regex(@"([.|/-|/w]+)", RegexOptions.Singleline); 
        MatchCollection mc = reg.Matches(Expression); 
        for (int i = 0; i < mc.Count; i++) 
        { 
            if (i == 0) 
            { 
                _expre = mc[i].Value; 
            } 
            if (i == 1) 
            { 
                fun = mc[i].Value; 
            } 
            if (i == 2) 
            { 
                if (int.TryParse(mc[i].Value, out index) == false) 
                { 
                    keyword = mc[i].Value; 
                } 
            } 
        } 
        List<HtmlNode> list = new List<HtmlNode>(); 
 
        if (string.IsNullOrEmpty(fun) == true) 
        { 
            if (Expression.StartsWith(".")) 
            { 
                return Class(_HtmlNodes, Expression).ToList(); 
            } 
            else 
            { 
                return NodeType(_HtmlNodes, Expression).ToList(); 
            } 
        } 
        else 
        { 
            foreach (var n in _HtmlNodes) 
            { 
                IEnumerable<HtmlNode> v; 
                if (_expre.StartsWith(".")) 
                { 
                    v = Class(n, _expre); 
                } 
                else 
                { 
                    v = NodeType(n, _expre); 
                } 
 
 
                list.AddRange(FunAction(v, fun, index, keyword)); 
            } 
            return list; 
        } 
    } 
    #region 函数处理 
    /// <summary> 
    /// 函数处理 www.2cto.com  
    /// </summary> 
    /// <param name="v"></param> 
    /// <param name="fun"></param> 
    /// <returns></returns> 
    private IEnumerable<HtmlNode> FunAction(IEnumerable<HtmlNode> v, string fun, int index, string keyword) 
    { 
        switch (fun.ToLower()) 
        { 
            case "eq": 
                return v.Where((nn, _index) => _index == index); 
            case "lt": 
                return v.Where((nn, _index) => _index < index); 
            case "gt": 
                return v.Where((nn, _index) => _index > index); 
            case "first": 
                if (v.Count() > 0) 
                    return new HtmlNode[] { v.First() }; 
                else 
                    return v; 
            case "last": 
                if (v.Count() > 0) 
                    return new HtmlNode[] { v.Last() }; 
                else 
                    return v; 
            case "even": 
                return v.Where((nn, _index) => _index % 2 == 0); 
            case "odd": 
                return v.Where((nn, _index) => (_index & 1) == 1); 
            case "next": 
                return v.Select(nn => nn.NextSibling); 
            case "contains": 
                return v.Where(x => { return x.InnerHtml.Contains(keyword); }); 
            case "empty": 
                return v.Where(x => { return x.HasChildNodes == false; }); 
            case "header": 
                string[] headers = new string[] { "h1", "h2", "h3", "h4", "h5", "h6" }; 
                return FindChildNodes(v.ToArray()).Where(x => { return headers.Contains(x.OriginalName); }); 
            default: 
                throw new NotSupportedException("函数不支持。"); 
        } 
    } 
    #endregion 
    #endregion 
 
    #region 根据类名找节点 
    private ParallelQuery<HtmlNode> Class(HtmlNode hn, string Expression) 
    { 
        return Class(new HtmlNode[] { hn }, Expression); 
    } 
    /// <summary> 
    /// 根据类名找节点 
    /// </summary> 
    /// <param name="_HtmlNodes"></param> 
    /// <param name="Expression"></param> 
    /// <returns></returns> 
    private ParallelQuery<HtmlNode> Class(IList<HtmlNode> _HtmlNodes, string Expression) 
    { 
        var v = FindChildNodes(_HtmlNodes).AsParallel().Where(x => x.Attributes["class"] != null); 
 
        var Y = v.Where(x => x.Attributes["class"].Value.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).Contains(Expression.TrimStart('.'), StringComparer.CurrentCultureIgnoreCase)); 
 
        return Y; 
    } 
    #endregion 
 
    #region 根据类型找节点 
    /// <summary> 
    /// 根据类型找节点 
    /// </summary> 
    /// <param name="hn"></param> 
    /// <param name="Expression"></param> 
    /// <returns></returns> 
    private ParallelQuery<HtmlNode> NodeType(HtmlNode hn, string Expression) 
    { 
        return NodeType(new HtmlNode[] { hn }, Expression); 
    } 
    /// <summary> 
    /// 根据类型找节点 
    /// </summary> 
    /// <param name="_HtmlNodes"></param> 
    /// <param name="Expression"></param> 
    /// <returns></returns> 
    private ParallelQuery<HtmlNode> NodeType(IList<HtmlNode> _HtmlNodes, string Expression) 
    { 
        var v = FindChildNodes(_HtmlNodes).AsParallel().Where( 
                 x => x.OriginalName.Equals(Expression, StringComparison.CurrentCultureIgnoreCase)); 
 
 
        return v; 
    } 
    #endregion 
 
    #region 查找所有下级 
    /// <summary> 
    /// 查找所有下级 
    /// </summary> 
    /// <param name="_HtmlNodes"></param> 
    /// <returns></returns> 
    private List<HtmlNode> FindChildNodes(IList<HtmlNode> _HtmlNodes) 
    { 
        if (_HtmlNodes == null) 
        { 
            throw new Exception(""); 
        } 
        List<HtmlNode> list = new List<HtmlNode>(); 
        foreach (var v in _HtmlNodes) 
        { 
            FindChildNodesAction(v, list); 
        } 
 
        return list; 
    } 
    private void FindChildNodesAction(HtmlNode hn, List<HtmlNode> list) 
    { 
        if (list == null) 
        { 
            throw new Exception(""); 
        } 
        foreach (var v in hn.ChildNodes) 
        { 
            if (hn.NodeType == HtmlNodeType.Element) 
            { 
                list.Add(v); 
                FindChildNodesAction(v, list); 
            } 
        } 
    } 
 
    #endregion 
 
 

 

摘自 winner2050的专栏