tools/c2m.sh

   1 #!/bin/bash
   2
   3 set -x # uncomment for bash script debugging
   4 echo "c2m -------------------------------------------------------------"
   5 ### ============================================================================
   6 ### Licensed under the Apache License, Version 2.0 (the "License");
   7 ### you may not use this file except in compliance with the License.
   8 ### You may obtain a copy of the License at
   9 ###
  10 ###       http://www.apache.org/licenses/LICENSE-2.0
  11 ###
  12 ### Unless required by applicable law or agreed to in writing, software
  13 ### distributed under the License is distributed on an "AS IS" BASIS,
  14 ### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 ### See the License for the specific language governing permissions and
  16 ### limitations under the License.
  17 ### ============LICENSE_END=====================================================
  18
  19
  20 ###
  21 ### c2m
  22 ###
  23 ### AUTHOR(S):
  24 ### Thomas Kulik, Deutsche Telekom AG, 2020
  25 ###
  26 ### DESCRIPTION:
  27 ### c2m automates additional tasks required in case you want to export and
  28 ### convert a set of wiki pages. the export and first conversion to markdown is
  29 ### done by confluence2md, provided by viaboxx.
  30 ### c2m processes a list of (to be exported) wiki pages, creates corresponding
  31 ### export directories, exports and converts pages (in various formats if
  32 ### required), opens an editor and cleans up afterwards.
  33 ### c2m checks also for problematic content in the export and creates a warning
  34 ### in case of detection.
  35 ###
  36 ### ISSUES:
  37 ### - markdown (md) output of confluence2md contains sometimes tags that are
  38 ###   somehow "merged" with the topic headline; manual edit is required here
  39 ###
  40 ### OPEN:
  41 ### - confluence2md does not support all of the currently used confluence page
  42 ###   types (structured-macros) - result for unsupported pages is
  43 ###   "not satisfying"; enhancements (java) are required
  44 ### - opt: toc creation in root document in case you export a tree of documents
  45 ###   to separate files
  46 ### - opt: remove wiki credentials from script
  47 ###
  48 ### REQUIRED:
  49 ### - pandoc, retext, confluence2md, java (older version for confluence2md),
  50 ###   login for the confluence wiki
  51 ###
  52 ### SEE ALSO:
  53 ### - https://www.viaboxx.de/code/confluence2md/
  54 ### - https://github.com/viaboxxsystems/confluence2md
  55 ###
  56
  57
  58 ###
  59 ### CHANGELOG (LATEST ON TOP)
  60 ###
  61 ### 1.2.0 (2021-08-02) Corrections to http/https proxy handling and support to
  62 ###                    get Confluence credentials from env variables instead of
  63 ###                    directly from the code.
  64 ### 1.1.0 (2020-03-10) added support for http/https proxy and anonymous wiki
  65 ###                    access. thx to eric, nicolas and sylvain (orange, france)
  66 ###                    confluence2md jar file now has to be in the same path as
  67 ###                    c2m.
  68 ### 1.0.0 (2020-03-09) initial release
  69 ###
  70
  71
  72 ###
  73 ### c2m example pagelist
  74 ###
  75 ### example pagelist (field descriptions below); it uses the delimiter "|" for
  76 ### the four fields per line.
  77 ### copy/paste page id and title from wiki; to get the wiki page_id you have to
  78 ### login to the wiki, open the page and choose e.g. the history.
  79 ### depth: use depth to follow down the child-pages hierarchy if required:
  80 ### -1=infinite, 0=no children, #=number of child-pages to follow.
  81 ### every hierarchy "0" entry will lead into the creation of a dedicated working
  82 ### directory where the page and child-pages are stored.
  83 ### for better readability you can add spaces to the list, but use "|" as a
  84 ### delimiter. lines starting with a # are filtered by c2m.
  85 ###
  86 ### hierarchy | page_id  | page_title                      | depth
  87 ###
  88 ### 0         |  1018748 | ONAP Portal                     |  0
  89 ### 1.1       |  1018759 | ONAP Portal for users           |  0
  90 ### 1.2       |  1018762 | ONAP Portal for administrators  |  0
  91 ### 1.2.1     |  1018764 | Admins                          |  0
  92 ### 1.2.2     |  1018811 | Users                           |  0
  93 ### 1.2.3     |  1018821 | Portal Admins                   |  0
  94 ### 1.2.4     |  1018826 | Application Onboarding          |  0
  95 ### 1.2.5     |  1018832 | Widget Onboarding               |  0
  96 ### 1.2.6     |  1018835 | Edit Functional Menu            |  0
  97 ### 1.2.7     | 16004953 | Portal Microservices Onboarding |  0
  98 ###
  99 ### in case you want to export to only one single output page (that contains all
 100 ### child-pages of the above example) use:
 101 ###
 102 ### 0         |  1018748 | ONAP Portal                     | -1
 103 ###
 104
 105
 106 ###
 107 ### some initial variables
 108 ###
 109
 110 script_version="1.2.0 (2021-08-02)"
 111
 112 if [[ -z "$CONFLUENCE_USERNAME" || -z "$CONFLUENCE_PASSWORD" ]]
 113 then
 114     echo "Mandatory environment variables:"
 115     echo "  CONFLUENCE_USERNAME: Confluence username"
 116     echo "  CONFLUENCE_PASSWORD: Confluence password."
 117     echo "Be aware! Setting bash debuging on will print credentials."
 118     exit
 119 fi
 120
 121 user="${CONFLUENCE_USERNAME}";
 122 passwd="${CONFLUENCE_PASSWORD}";
 123 credentials="${user}":"${passwd}";
 124 server="https://wiki.onap.org";
 125 [ -z "$rst_editor" ] && rst_editor="retext --preview";
 126
 127 # remove credentials for those using anonymous access
 128 test "${credentials}" = "*****:*****" && credentials=""
 129
 130 # explicit script dir to locate jar file
 131 basedir="$(cd "$(dirname "$0")"; pwd)"
 132
 133 ###
 134 ### some initial tasks after script has been started
 135 ###
 136
 137 ###
 138 ### print script version, date and time
 139 ###
 140
 141 echo "INFO ***************************************************************************"
 142 echo "INFO c2m Version ${script_version}, started $(date)";
 143
 144 ###
 145 ### simple script argument handling
 146 ###
 147
 148 page_list=$1;
 149
 150 # check if there is an argument at all
 151 if [[ "$page_list" == "" ]] ; then
 152     echo 'Usage: c2m [PAGELIST]'
 153     exit 1
 154 fi
 155
 156 # check if argument is a file
 157 if [ ! -f $page_list ] ; then
 158     echo "Error: can't find pagelist \"$page_list\""
 159     exit 1
 160 fi
 161
 162 ###
 163 ### declare the functions of this script
 164 ###
 165
 166 ###
 167 ### function: create working directory; save (only the last) existing one; remove older versions; do some error handling
 168 ###
 169
 170 function create_working_dir {
 171
 172   base_dir="output"
 173   [ ! -d $base_dir ] && mkdir $base_dir
 174
 175   # compose name for working directory
 176   #working_dir="${page_id}-${page_title}";
 177   #working_dir="${page_title}-id${page_id}";
 178   working_dir="${base_dir}/${page_title}";
 179   echo "INFO ***************************************************************************"
 180   echo "INFO working directory \"$working_dir\" will be created"
 181
 182   # check if current working directory is already in the list
 183   if [[ " ${existing_working_dirs[@]} " =~ " ${working_dir} " ]]; then
 184     echo "ERRR ***************************************************************************"
 185     echo "ERRR working directory \"${working_dir}\" already exists - check entries in page_list for duplicates"
 186     echo "ERRR exiting ..."
 187     exit -1
 188   else
 189     # store working_dir name for error handling
 190     existing_working_dirs+=(${working_dir})
 191   fi
 192
 193   # sample code
 194   #if [[ ! " ${array[@]} " =~ " ${value} " ]]; then
 195   #    # whatever you want to do when arr doesn't contain value
 196   #fi
 197
 198   # check existence of working directory
 199   if [ -d "$working_dir" ]; then
 200     # check existence of old saved working directory
 201     if [ -d "${working_dir}.old" ]; then
 202       # remove the old saved working directory
 203       rm -r "${working_dir}.old";
 204     fi
 205     # save (only) the latest working directory
 206     mv $working_dir "$working_dir.old";
 207   fi
 208   # finally create the working directory and cd into it
 209   mkdir $working_dir;
 210   cd $working_dir;
 211 }
 212
 213 ###
 214 ### function: pull pages from wiki - currently we are testing some export variations
 215 ###
 216
 217 function pull_pages_from_wiki {
 218
 219   # define outfile name
 220   #out_file="${page_title}-id${page_id}";
 221   out_file="${page_title}";
 222
 223   # set proxy if needed
 224   if [[ -v http_proxy && ! -z "$http_proxy" ]]; then
 225     proxy_to_parse="${http_proxy/http:\/\//""}";
 226     echo "http_proxy is set to \"${proxy_to_parse}\"";
 227   elif [[ -v https_proxy && ! -z "$https_proxy" ]]; then
 228     proxy_to_parse="${https_proxy/https:\/\//""}";
 229     echo "https_proxy is set to \"${proxy_to_parse}\"";
 230   fi
 231
 232   #java_options="--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/java.util=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED --add-opens java.base/java.lang.annotation=ALL-UNNAMED --add-opens java.base/java.lang.reflect=ALL-UNNAMED --add-opens java.base/java.net=ALL-UNNAMED"
 233
 234   if [[ $proxy_to_parse =~ ^([\.0-9]+) ]]; then
 235     java_options="${java_options} -Dhttps.proxyHost=${BASH_REMATCH[1]} -Dhttp.proxyHost=${BASH_REMATCH[1]}"
 236     echo "${java_options}"
 237   fi
 238   if [[ $proxy_to_parse =~ .*:([0-9]+) ]]; then
 239     java_options="${java_options} -Dhttps.proxyPort=${BASH_REMATCH[1]} -Dhttp.proxyPort=${BASH_REMATCH[1]}"
 240     echo "${java_options}"
 241   fi
 242
 243   # TODO: -depth
 244   # pull pages from wiki and convert to markdown (as a source for conversion by pandoc)
 245   java $java_options -jar $basedir/confluence2md-2.1-fat.jar +H true +T false +RootPageTitle false +FootNotes true -maxHeaderDepth 7 -depth $depth -v true -o ${out_file}.md -u "${credentials}" -server $server $page_id
 246 }
 247
 248 ###
 249 ### function: simple search and (red colored) warning if special terms are detected in the md output file
 250 ###
 251
 252 function detect_unwanted_content_in_md_outfile {
 253 for search_term in "ecomp" "wiki.onap.com" "10.53.199.7" "at&t"
 254 do
 255   if grep $search_term ${out_file}.md; then
 256     echo -e "\e[31mWARN ***************************************************************************\e[39m";
 257     echo -e "\e[31mWARN term \"${search_term}\" detected in ${out_file}.md\e[39m";
 258   fi
 259 done
 260 }
 261
 262 ###
 263 ### function: pandoc conversion from md (variants) to rst - currenty testing some conversion formats
 264 ###
 265
 266 function convert_md_outfile_to_rst {
 267   #depending on the given source format (--from) the results may vary
 268   #pandoc -s --toc --toc-depth=5 --from markdown_mmd      --to rst "${out_file}.md" -o "${out_file}-markdown_mmd.rst"
 269   #pandoc -s --toc --toc-depth=5 --from markdown_strict   --to rst "${out_file}.md" -o "${out_file}-markdown_strict.rst"
 270   #pandoc -s --toc --toc-depth=5 --from markdown_phpextra --to rst "${out_file}.md" -o "${out_file}-markdown_phpextra.rst"
 271   #pandoc -s --toc-depth=5 --from markdown_phpextra --to rst "${out_file}.md" -o "${out_file}-markdown_phpextra.rst"
 272   pandoc -s --toc-depth=5 --from markdown_phpextra --to rst "${out_file}.md" -o "${out_file}.rst"
 273 }
 274
 275 ###
 276 ### function: check results in rst editor
 277 ###
 278
 279 function open_rst_editor {
 280   #echo "DBUG ***************************************************************************"
 281   #echo "DBUG open \"${out_file}\*.rst\" with rst editor"
 282   $rst_editor ${out_file}*.rst &
 283 }
 284
 285 ###
 286 ### function: clean up export directories from files no longer needed
 287 ###
 288
 289 function clean_up {
 290   rm *.md                2>/dev/null
 291   rm attachments/*.json  2>/dev/null
 292   rm attachments/.*.json 2>/dev/null
 293 }
 294
 295 ###
 296 ### main: let's start the work ...
 297 ###
 298
 299 # read in pagelist file, filter lines starting with a comment and create an array that contains all (uncommented) lines of the file
 300
 301 # sample code
 302 # IFS=',' read -r -a page_array <<< "$page_list" # in case $page_list was defined as a variable in this script; use "," as the delimiter
 303 #readarray -t page_array < $page_list; # old version
 304
 305 readarray -t page_array < <(grep -v "^#" $page_list); # new version which skips line with comments
 306
 307 # INFO: show list of pages by printing every line of the array
 308 echo "INFO ***************************************************************************"
 309 for line in "${page_array[@]}"
 310 do
 311     echo "INFO $line"
 312 done
 313
 314 # the main loop reads the page_array line by line and processes the content
 315 for line in "${page_array[@]}"
 316 do
 317     echo "INFO - bupp $line"
 318     # cut out values from the current line (delimiter is now the "|") and assign them to the correct variables
 319     hierarchy=$(echo $line | cut -f1 -d\|)
 320       page_id=$(echo $line | cut -f2 -d\|)
 321    page_title=$(echo $line | cut -f3 -d\|)
 322         depth=$(echo $line | cut -f4 -d\|)
 323
 324     # remove leading and trailing spaces from variables
 325     hierarchy="$(echo -e "${hierarchy}"  | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')";
 326       page_id="$(echo -e "${page_id}"    | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')";
 327    page_title="$(echo -e "${page_title}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')";
 328         depth="$(echo -e "${depth}"      | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')";
 329
 330     # substitude all blanks in page_title with a minus sign
 331     page_title=$(echo -e ${page_title} | tr '[:blank:]' '-');
 332     echo "DBUG page_title=\"$page_title\""
 333
 334     # convert page_title to lowercase
 335     page_title=$(echo -e ${page_title} | tr '[:upper:]' '[:lower:]');
 336     #echo "DBUG page_title=\"$page_title\""
 337
 338     # remove all characters from page_title which may cause problems in the shell ... or are reserved by conventions of this script
 339     #page_title="$(echo -e "${page_title}" | sed -e 's/[^A-Za-z0-9._-]//g')"; # a less strict version
 340     page_title="$(echo -e "${page_title}" | sed -e 's/[^A-Za-z0-9-]//g')";
 341     echo "DBUG page_title=\"$page_title\""
 342
 343     # INFO: print variables to check content
 344     echo "INFO ***************************************************************************"
 345     echo "INFO hierarchy  = \"$hierarchy\""
 346     echo "INFO page_id    = \"$page_id\""
 347     echo "INFO page_title = \"$page_title\""
 348     echo "INFO depth      = \"$depth\""
 349
 350     # create working directory - done for every! "hierarchy 0" entry of page_list
 351     if [ "$hierarchy" == "0" ]
 352     then
 353       create_working_dir
 354     fi
 355
 356     # call functions to process page
 357     pull_pages_from_wiki
 358     detect_unwanted_content_in_md_outfile
 359     convert_md_outfile_to_rst
 360     open_rst_editor
 361     clean_up
 362
 363 # main loop end
 364 done
 365
 366 # bye!
 367 echo "INFO ***************************************************************************"
 368 echo "INFO c2m Version ${script_version}, ended $(date)"
 369 echo ""
 370 exit 0