1 This is a bash script that generates a full RSS feed from a directory containing simple web pages. The script works by pulling out the text between a few HTML tags to fill in the information needed for an RSS feed. It uses `awk` and `sed` to accomplish this. To use this script, replace `www.example.com` with your domain name and `/blog` with your web page directory path.
2 The script generates most of the RSS feed by grabbing the text between certain HTML tags and plugging it into an XML file. The `title` for each item in the .xml file is taken from the text between the `h1` tags of the associated HTML file in the directory. The `pubDate` for each item is taken from the text between the `time` tags of the associated HTML file in the directory (dates must be in the `YYYY-MM-DD` format in the HTML documents). The `description` is taken from what is between the `article` tags of the associated HTML file in the directory.
3 Some of the information in the feed comes from the file names of the HTML documents in the directory, or can be generated without referring to the contents of the HTML documents. The `link` tag for each item in the .xml file is generated by taking the associated HTML file name in the directory and adding a domain name and directory path.
#!/bin/bash
#
# Generate an RSS 2.0 feed (blog.xml, in the current directory) from a
# directory of simple HTML pages.
#
# One <item> is emitted for every *.html file in $posts_directory:
#   <title>        text between the first <h1> and </h1>
#   <pubDate>      text between the first <time> and </time>; a YYYY-MM-DD
#                  value is converted to the RFC 822 form RSS requires
#   <description>  the lines from <article> through </article>, as CDATA
#   <link>/<guid>  $site_url/blog/ + the file name without its .html suffix
#
# The tags are matched literally, so they must carry no attributes, and the
# <h1>/<time> element must open and close on the same line.
#
# To adapt the script, change site_url to your domain and posts_directory to
# the directory that holds the pages.
set -euo pipefail
IFS=$'\n\t'

# Public base URL of the site (no trailing slash).
site_url="https://www.example.com"
# Directory containing HTML blog posts.
posts_directory="/blog"
# Name of the feed file; also used to build the feed's self-link.
feed_file="blog.xml"

#######################################
# Print the text between the first <tag> and </tag> in a file.
# Arguments: $1 - tag name (e.g. h1), $2 - path of the HTML file
# Outputs:   the text on stdout; nothing when the tag is absent
#######################################
first_tag_text() {
  local tag=$1 file=$2
  awk -F"<${tag}>|</${tag}>" -v open_tag="<${tag}>" \
    'index($0, open_tag) { print $2; exit }' "$file"
}

#######################################
# Convert a YYYY-MM-DD date to the RFC 822 format used by <pubDate>.
# Conversion relies on GNU date; if it is unavailable, or the input is not
# YYYY-MM-DD, the input is printed unchanged.
# Arguments: $1 - date string taken from the HTML page
# Outputs:   the (possibly converted) date on stdout
#######################################
rfc822_date() {
  local ymd=$1 converted
  # The format check matters: GNU `date -d ""` would silently yield today.
  # The year check rejects output from a `date` that misread the options.
  if [[ "$ymd" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]] &&
    converted=$(LC_ALL=C date -u -R -d "$ymd" 2>/dev/null) &&
    [[ "$converted" == *" ${ymd:0:4} "* ]]; then
    printf '%s\n' "$converted"
  else
    printf '%s\n' "$ymd"
  fi
}

#######################################
# Write the complete RSS document for a directory of posts to stdout.
# Globals:   site_url, feed_file (read)
# Arguments: $1 - directory containing the *.html posts
# Outputs:   the RSS XML on stdout
#######################################
generate_feed() {
  local dir=$1
  local filename post_title post_date post_content post_id

  # RSS feed header
  printf '%s\n' \
    '<?xml version="1.0" encoding="UTF-8" ?>' \
    '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">' \
    '<channel>' \
    '<title>RSS feed title</title>' \
    "<link>${site_url}</link>" \
    '<description>Example RSS feed </description>' \
    '<language>en-us</language>' \
    "<atom:link href=\"${site_url}/${feed_file}\" rel=\"self\" type=\"application/rss+xml\"/>"

  # Parse HTML files in the directory
  for filename in "$dir"/*.html; do
    # Also skips the literal pattern left behind when nothing matches.
    [[ -f "$filename" ]] || continue

    post_title=$(first_tag_text h1 "$filename")
    post_date=$(rfc822_date "$(first_tag_text time "$filename")")
    post_content=$(sed -n '/<article>/,/<\/article>/p' "$filename" \
      | sed '/^$/d' \
      | tr -s ' ')
    post_id=$(basename -- "$filename" .html)

    # Add the post to the feed
    printf '%s\n' \
      '<item>' \
      "<title>${post_title}</title>" \
      "<link>${site_url}/blog/${post_id}</link>" \
      "<guid>${site_url}/blog/${post_id}</guid>" \
      "<pubDate>${post_date}</pubDate>" \
      "<description><![CDATA[${post_content}]]></description>" \
      '</item>'
  done

  # Close the RSS feed
  printf '%s\n' '</channel>' '</rss>'
}

generate_feed "$posts_directory" > "$feed_file"
echo 'RSS blog feed generated successfully.'