CFLib.org – Common Function Library Project

removeHTML(source)

Last updated November 14, 2007

author

Scott Bennett

Version: 1 | Requires: CF5 | Library: StrLib

Description:
Removes all HTML from a string: strips tags, script blocks, style blocks, and head blocks, and replaces common special character codes with text equivalents.

Return Values:
Returns a string.

Example:

<!--- Usage example: capture a sample HTML document (head, script, style, comment, markup and character entities) in the variable "htmlstring". --->
<cfsavecontent variable="htmlstring">
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html>
<head>
	<title>My Title should be removed</title>
</head>
<script language="JavaScript">
// Script block will be removed
function myfunction(){
	alert("this is my function");
}
</script>
<style type="text/css">
	body, p, td, li, form  { font-family: verdana, sans-serif; font-size: 13px; color: #000000;
							margin-top: 0px }
</style>
<body>
<div align="center">
<!-- HtML comments should be removed-->
<p>This is the body of my page it should show up fine. </p><br>
<b><i>This sentance should not be bold or italic</i></b> These special characters should be replaced
<ul>
	<li>&nbsp;</li>
	<li>&bull;</li>
	<li>&lsaquo;</li>
	<li>&rsaquo;</li>
	<li>&trade;</li>
	<li>&frasl;</li>
	<li>&lt;</li>
	<li>&gt;</li>
	<li>&copy;</li>
	<li>&copy;</li>
	<li>&reg;</li>
</ul>
<table><tr><td>There should be no table here</td></tr></table>
</div>
</body>
</html>
</cfsavecontent>
<!--- Output the captured sample after passing it through RemoveHTML(). --->
<cfoutput>#RemoveHTML(htmlString)#</cfoutput>

Parameters:

Name Description Required
source String to format. Yes

Full UDF Source:

/**
 * Removes all HTML from a string: strips head/script/style blocks, comments and tags, and converts common character entities to plain text.
 * Tag names are matched case-insensitively. TD tags become tabs; BR, LI, DIV, TR and P tags become carriage returns.
 * 
 * @param source 	 String to format. (Required)
 * @return Returns a string. 
 * @author Scott Bennett (scott@coldfusionguy.com) 
 * @version 1, November 14, 2007 
 */
function removeHTML(source){
	
	// Tail of an opening tag: optional attributes (or a self-closing slash) and the closing ">".
	// Anything after the tag name must begin with whitespace or "/", so "p" cannot match
	// "pre", "tr" cannot match "track", "head" cannot match "header", and so on.
	var tagEnd = "([[:space:]/][^>]*)?>";
	
	// Collapse every run of two or more whitespace characters into one space,
	// because browsers render such runs as a single space
	var result = REReplace(trim(source), "[[:space:]]{2,}", " ", "ALL");
	
	// Remove the header, including its contents
	result = REReplaceNoCase(result, "<[[:space:]]*head" & tagEnd & ".*?</head[[:space:]]*>", "", "ALL");
	
	// Remove all scripts, including their contents
	result = REReplaceNoCase(result, "<[[:space:]]*script" & tagEnd & ".*?</script[[:space:]]*>", "", "ALL");
	
	// Remove all styles, including their contents
	result = REReplaceNoCase(result, "<[[:space:]]*style" & tagEnd & ".*?</style[[:space:]]*>", "", "ALL");
	
	// Remove HTML comments as a unit; the generic tag pattern further down
	// would stop at the first ">" inside a comment and leave the rest behind
	result = REReplace(result, "<!--.*?-->", "", "ALL");
	
	// Insert a tab in place of each <td> tag
	result = REReplaceNoCase(result, "<[[:space:]]*td" & tagEnd, chr(9), "ALL");
	
	// Insert a line break in place of <br> and <li> tags
	// (attributes and self-closing forms such as <br /> are accepted)
	result = REReplaceNoCase(result, "<[[:space:]]*br" & tagEnd, chr(13), "ALL");
	result = REReplaceNoCase(result, "<[[:space:]]*li" & tagEnd, chr(13), "ALL");
	
	// Insert a line break in place of <div>, <tr> and <p> tags
	result = REReplaceNoCase(result, "<[[:space:]]*div" & tagEnd, chr(13), "ALL");
	result = REReplaceNoCase(result, "<[[:space:]]*tr" & tagEnd, chr(13), "ALL");
	result = REReplaceNoCase(result, "<[[:space:]]*p" & tagEnd, chr(13), "ALL");
	
	// Remove remaining tags like <a>, links, images,
	// closing tags etc - anything that is enclosed inside < >
	result = REReplace(result, "<.*?>", "", "ALL");
	
	// Replace special characters with text equivalents.
	// These are literal substitutions, so plain Replace is sufficient.
	result = Replace(result, "&nbsp;", " ", "ALL");
	result = Replace(result, "&bull;", " * ", "ALL");
	result = Replace(result, "&lsaquo;", "<", "ALL");
	result = Replace(result, "&rsaquo;", ">", "ALL");
	result = Replace(result, "&trade;", "(tm)", "ALL");
	result = Replace(result, "&frasl;", "/", "ALL");
	result = Replace(result, "&lt;", "<", "ALL");
	result = Replace(result, "&gt;", ">", "ALL");
	result = Replace(result, "&copy;", "(c)", "ALL");
	result = Replace(result, "&reg;", "(r)", "ALL");
	result = Replace(result, "&quot;", '"', "ALL");
	
	// Remove all other entities. More special character conversions
	// can be added above if needed. Only well-formed references are removed
	// (&name;, &#123;, &#x1F;) so ordinary text such as "Q&A now;" is left alone.
	// &amp; is skipped here and decoded last so that "&amp;lt;" ends up as the
	// literal text "&lt;" instead of being decoded twice or deleted.
	result = REReplace(result, "&(?!amp;)(##[0-9]+|##[xX][0-9a-fA-F]+|[[:alpha:]][[:alnum:]]*);", "", "ALL");
	result = Replace(result, "&amp;", "&", "ALL");
	
	// That's it.
	return result;

}
blog comments powered by Disqus

Search CFLib.org


Latest Additions

Kevin Cotton added
date2ExcelDate
May 5, 2016

Raymond Camden added
CapFirst
April 25, 2016

Chris Wigginton added
loremIpsum
January 18, 2016

Gary Stanton added
calculateArrival...
November 19, 2015

Sebastiaan Naafs - van Dijk added
getDaysInQuarter
November 13, 2015

Created by Raymond Camden / Design by Justin Johnson