68 lines
3 KiB
Python
68 lines
3 KiB
Python
"""
|
|
MIT License

Copyright (c) Jan Beilicke <dev@jotbe.io>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Code written with the assistance of ChatGPT.
|
|
"""
|
|
|
|
import xml.etree.ElementTree as ET
|
|
import re
|
|
import argparse
|
|
from urllib.parse import unquote
|
|
|
|
def find_mismatches(file_path, output_file):
    """Report links whose visible text differs from their ``href`` target.

    Parses the XML document at *file_path*, visits every ``<item>`` element
    and scans the HTML held in its ``content:encoded`` child for ``<a>``
    tags.  A link is reported when its ``href`` starts with ``http`` and,
    after URL-unquoting and ignoring trailing slashes, is not equal to the
    link text.

    Args:
        file_path: Path to the XML file to check.
        output_file: Path of the text report to write.  The file is created
            or overwritten and is always encoded as UTF-8.

    Raises:
        xml.etree.ElementTree.ParseError: If the XML is not well-formed.
        OSError: If the input cannot be read or the report cannot be written.
    """
    # Parse first so that a broken input never leaves an empty report behind.
    root = ET.parse(file_path).getroot()

    # Group 1 captures the href value, group 2 the link text.  DOTALL lets a
    # single link span several lines of HTML.
    link_pattern = re.compile(r'<a\s+.*?href=["\'](.*?)["\'].*?>(.*?)</a>', re.DOTALL)
    namespaces = {'content': 'http://purl.org/rss/1.0/modules/content/'}

    # Explicit UTF-8: titles and links may contain non-ASCII characters, and
    # the platform default encoding is not guaranteed to represent them.
    with open(output_file, 'w', encoding='utf-8') as result_file:
        for item in root.findall(".//item"):  # Adjust the path based on your XML structure
            content_encoded = item.find("./content:encoded", namespaces=namespaces)
            if content_encoded is None or not content_encoded.text:
                continue

            # An item without a <title> (or with an empty one) must not
            # abort the whole run; report it under a placeholder instead.
            title = item.findtext('title') or '(untitled)'

            for href, link_text in link_pattern.findall(content_encoded.text):
                # Only absolute http(s) targets are compared.
                if not href.startswith('http'):
                    continue
                # Trailing slashes and percent-encoding do not count as a
                # difference between the target and the visible text.
                if unquote(href.rstrip('/')) == unquote(link_text.rstrip('/')):
                    continue

                result_file.write(f"Mismatch found in item with title '{title}':\n")
                result_file.write(f" Link Text: {link_text}\n")
                result_file.write(f" Href: {href}\n")
                result_file.write("\n")
|
|
|
|
if __name__ == "__main__":
    # Report location used when no --output option is given; this keeps the
    # behaviour of existing invocations unchanged.
    default_output_path = "feed-check-result.txt"

    parser = argparse.ArgumentParser(description="Check an XML file for mismatches between link text and href attributes.")
    parser.add_argument("xml_file_path", help="Path to the XML file to be checked")
    parser.add_argument(
        "-o", "--output",
        default=default_output_path,
        help="Path of the report file to write (default: %(default)s)",
    )
    args = parser.parse_args()

    find_mismatches(args.xml_file_path, args.output)
|