Share Your Code

Strip HTML Tags from Text

Posted by ethanpil, Posted on December 21, 2011

There is no genius here on my part; I only started with Corona a few days ago. However, I found a Lua snippet that removes HTML tags from HTML code, leaving just the plain text, and it's working great for me.

The original source for the code:
http://board.ptokax.ch/index.php?topic=7165.0

I made a slight change in the first few lines so that it works on text that is passed in rather than text read from a file. It has worked well for me.

Here it is:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
-- Elements that are removed together with everything between their
-- opening and closing tags.
local HTML_DROPPED_ELEMENTS = { "head", "script", "style" }

local HTML_IS_DROPPED = {}
for _, element in ipairs (HTML_DROPPED_ELEMENTS) do
  HTML_IS_DROPPED[element] = true
end

-- Plain-text stand-ins for opening (or self-closing) tags, keyed by the
-- exact lowercase tag name. Any tag not listed is simply deleted.
local HTML_OPEN_TAG_TEXT =
  {
    td  = "\t",      -- table cells become tabulators
    br  = "\n",      -- <br>, <br/> and <br /> all become a linebreak
    li  = " *  ",    -- list items get an asterisk bullet
    div = "\n\n",
    p   = "\n\n",
    tr  = "\n\n",
    ul  = "\n\n",
  }

-- Plain-text stand-ins for closing tags.
local HTML_CLOSE_TAG_TEXT =
  {
    li = "\n",
    ul = "\n\n",
  }

-- Replacement text for named entities, keyed by the name between "&" and ";".
-- Entities that are not listed here (including numeric ones) are removed.
local HTML_ENTITY_TEXT =
  {
    nbsp   = " ",
    bull   = " *  ",
    lsaquo = "<",
    rsaquo = ">",
    trade  = "(tm)",
    frasl  = "/",
    lt     = "<",
    gt     = ">",
    amp    = "&",
    quot   = '"',
    apos   = "'",
    copy   = "(c)",
    reg    = "(r)",
  }

--- Convert HTML markup to plain text.
-- Whitespace runs are collapsed, <head>/<script>/<style> blocks are removed
-- with their content, a few structural tags are turned into tabs, bullets and
-- newlines, every other tag is deleted, and entities are decoded or removed.
-- @param text string containing HTML; nil is treated as an empty string
-- @return the plain-text rendering as a single string
function HTML_ToText (text)
  if text == nil then
    return ""
  end

  -- Kill the developer formatting (tabs, CR, LF, repeated spaces): browsers
  -- treat any run of whitespace as one space. A single pass keeps the result
  -- independent of table iteration order.
  text = string.gsub (text, "%s+", " ")

  -- Strip attributes and stray spaces from the tags of dropped elements so
  -- they can be matched literally below. The tag name is captured whole, so
  -- <header> is never mistaken for <head>. Returning nil leaves a tag as is.
  text = string.gsub (text, "<%s*(/?)%s*(%w+)[^<>]*>",
    function (slash, name)
      name = string.lower (name)
      if HTML_IS_DROPPED[name] then
        return "<" .. slash .. name .. ">"
      end
      return nil
    end)

  -- Remove each dropped element with its content. The lazy ".-" stops at the
  -- nearest closing tag, so text between two <style> blocks survives.
  for _, element in ipairs (HTML_DROPPED_ELEMENTS) do
    text = string.gsub (text, "<" .. element .. ">.-</" .. element .. ">", "")
  end

  -- Replace structural tags by their text stand-in and nuke all other tags
  -- (comments and doctype declarations have no tag name and are nuked too).
  text = string.gsub (text, "%b<>",
    function (tag)
      local slash, name = string.match (tag, "^<%s*(/?)%s*(%w+)")
      if not name then
        return ""
      end
      local stand_ins = HTML_OPEN_TAG_TEXT
      if slash == "/" then
        stand_ins = HTML_CLOSE_TAG_TEXT
      end
      return stand_ins[string.lower (name)] or ""
    end)

  -- Decode entities in one pass: each "&name;" is looked up on its own, so an
  -- unknown entity only removes itself and decoded text is never re-scanned.
  text = string.gsub (text, "&(#?%w+);",
    function (name)
      return HTML_ENTITY_TEXT[name] or ""
    end)

  return text
end


Replies

zoid66
User offline. Last seen 28 weeks 6 days ago. Offline
Joined: 2 Feb 2011

Could this work when connecting to a website? I'm thinking of using it in a music player app that reads the song list from ShoutCast.

ethanpil
User offline. Last seen 45 weeks 1 day ago. Offline
Joined: 18 Dec 2011

This might work with XML, but I don't think it would be usable. The point here is to clean up HTML code you might have downloaded. For example:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
require( "html2text" ) -- expected to define the global function HTML_ToText

-- Page to fetch, and the name of the temporary file it is downloaded into.
local getURL = "http://developer.anscamobile.com/resources/docs"
local downloadName = "htmlDownload"

--- Listener passed to network.download.
-- On success it reads the downloaded file, deletes it, and prints the page
-- as plain text; on failure it prints a short message instead.
-- @param event the event table handed to the listener; only event.isError is read here
local function urlDownload ( event )
        -- print_r is not defined in this snippet; dump the event only when
        -- the host project provides it.
        if print_r then
                print_r( event )
        end

        if ( event.isError ) then
                print ( "Network error - download failed" )
                return
        end

        local filePath = system.pathForFile( downloadName, system.TemporaryDirectory )

        -- io.open returns nil plus a reason when the file cannot be opened
        local fh, reason = io.open( filePath, "r" )
        if not fh then
                print ( "Could not open " .. filePath .. ": " .. tostring( reason ) )
                return
        end

        -- read all contents of file into a string, then clean up the temp file
        local htmlContent = fh:read( "*a" )
        fh:close()
        os.remove( filePath )
        print( "Contents of " .. filePath .. ": \n" .. HTML_ToText( htmlContent ) )
end

network.download( getURL, "GET", urlDownload, downloadName, system.TemporaryDirectory )

horacebury
User offline. Last seen 3 hours 40 min ago. Offline
Joined: 17 Aug 2010

Hey @ethanpil, this is just what I was looking for! It makes a great addition to the XML library I posted:

http://developer.coronalabs.com/code/much-improved-dump-function-and-xml-simplify

Thank you :)