Saturday, December 17, 2011

Convert Mac Pages to HTML in PowerShell


# A very crude Mac Pages to HTML converter in PowerShell
# Preserves no formatting save for line breaks.
# Useful to extract the text so you can paste into Word

# A .pages file is a Zip; extract it to get the index.xml file 
# and parse out the paragraphs. 
# Note there's also a preview.pdf in the zip if all you need is to print.

# Pass the .pages file on the command line
# .\Pages-ToHtml.ps1 '.\Path\To\File.pages' > Converted.html
#
# Requires PSCX
Import-Module PSCX
# Fail early with a usage hint when no .pages path was supplied.
if (-not $args[0]) {
    throw "Usage: .\Pages-ToHtml.ps1 '.\Path\To\File.pages' > Converted.html"
}

# Locate the index.xml entry inside the archive and extract it.
$file = (Read-Archive $args[0] -format Zip ) | Where-Object { $_.Path -ieq "index.xml" } | Expand-Archive -PassThru
if (-not $file) {
    throw "index.xml not found in '$($args[0])'"
}

# NOTE(review): Get-Content decodes with its default encoding; if index.xml is
# UTF-8 without a BOM, non-ASCII text may be mangled -- confirm and consider
# passing -Encoding UTF8.
$xml = [xml](Get-Content $file)
$layout = $xml.document."text-storage"."text-body".section.layout
# Stripped of all formatting
#$layout.InnerText
# Write paragraphs only: one <p>...</p> group per paragraph element.
$layout.p | ForEach-Object {
    echo "<p>"
    # "#text" may yield a single string, several strings, or nothing at all;
    # wrap it in an array so every case is handled uniformly.
    foreach ($text in @($_."#text")) {
        if ($null -ne $text) {
            # Escape <, >, & and quotes so document text cannot break the HTML.
            [System.Security.SecurityElement]::Escape($text)
        }
    }
    # The closing tag must be on the same line as echo: a bare echo has no
    # InputObject and would prompt for input or fail.
    echo "</p>"
}