指引网

当前位置: 主页 > 编程开发 > .NET >

提取HTML代码中文字的C#函数-资讯动态

来源:网络 作者:佚名 点击: 时间:2017-11-17 03:42
[摘要] 方法1: ///提取HTML代码中文字的C#函数 /// <summary> /// 去除HTML标记 /// </summary> /// <param name="strHtml">包括HTML的源码</param> /// <returns>已经去除后的文字</returns> using System; using System.Text.RegularExpressions; ……

        方法1:

 

        ///提取HTML代码中文字的C#函数
        ///   <summary>
        ///   去除HTML标记
        ///   </summary>
        ///   <param   name="strHtml">包括HTML的源码   </param>
        ///   <returns>已经去除后的文字</returns>
        using   System;
        using   System.Text.RegularExpressions;
        public   class   StripHTMLTest{
              public   static   void   Main(){
                  string   s=StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
                  Console.WriteLine(s);
              }
              public   static   string   StripHTML(string   strHtml){
                  string   []   aryReg   ={
                              @"<script[^>]*?>.*?</script>",
                              @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
                              @"([\r\n])[\s]+",
                              @"&(quot|#34);",
                              @"&(amp|#38);",
                              @"&(lt|#60);",
                              @"&(gt|#62);",
                              @"&(nbsp|#160);",
                              @"&(iexcl|#161);",
                              @"&(cent|#162);",
                              @"&(pound|#163);",
                              @"&(copy|#169);",
                              @"&#(\d+);",
                              @"-->",
                              @"<!--.*\n"
                            };
                  string   []   aryRep   =   {
                                "",
                                "",
                                "",
                                "\"",
                                "&",
                                "<",
                                ">",
                                "   ",
                                "\xa1",//chr(161),
                                "\xa2",//chr(162),
                                "\xa3",//chr(163),
                                "\xa9",//chr(169),
                                "",
                                "\r\n",
                                ""
                              };

                  string   newReg   =aryReg[0];
                  string   strOutput=strHtml;
                  for(int   i   =   0;i<aryReg.Length;i++){
                      Regex   regex   =   new   Regex(aryReg[i],RegexOptions.IgnoreCase);
                      strOutput   =   regex.Replace(strOutput,aryRep[i]);
                  }
                  strOutput.Replace("<","");
                  strOutput.Replace(">","");
                  strOutput.Replace("\r\n","");
                  return   strOutput;
              }
          }

------分隔线----------------------------