tools/getrsttitle.py

   1 #!/usr/bin/env python3
   2
   3 ### ===========================================================================
   4 ### Licensed under the Apache License, Version 2.0 (the "License");
   5 ### you may not use this file except in compliance with the License.
   6 ### You may obtain a copy of the License at
   7 ###
   8 ###       http://www.apache.org/licenses/LICENSE-2.0
   9 ###
  10 ### Unless required by applicable law or agreed to in writing, software
  11 ### distributed under the License is distributed on an "AS IS" BASIS,
  12 ### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 ### See the License for the specific language governing permissions and
  14 ### limitations under the License.
  15 ###
  16 ### Copyright (C) 2021 Deutsche Telekom AG
  17 ### ============LICENSE_END====================================================
  18
  19 #
  20 # getrsttitle.py
  21 # AUTHOR(S):
  22 # Thomas Kulik, Deutsche Telekom AG, 2021
  23 # DESCRIPTION:
  24 # Processes a list of rst files and retrieves the first title for every single rst file.
  25 # Copy program to {branch} directory of cloned ONAP documentation and run it.
  26 # USAGE:
  27 # python3 getrsttitle.py filename
  28 #
  29 # Helpful resources:
  30 # https://regex101.com/r/YNYK2Q/1/
  31 # https://stackoverflow.com/questions/20312443/how-to-find-title-a-la-restructuredtext
  32 #
  33
  34 import re
  35 import os.path
  36 import sys
  37 import argparse
  38
  39 #
  40 # argument handling
  41 #
  42
  43 parser = argparse.ArgumentParser(description='Processes a list of rst files and retrieves the first title for every single rst file.')
  44 parser.add_argument('filename')
  45 args = parser.parse_args()
  46
  47 # regex to find title underlined with various characters
  48 #regex1 = r"(?:^|\n)(?!\=)([^\n\r]+)\r?\n(\=+)(?:\r?\n| *$)"
  49 #regex2 = r"(?:^|\n)(?!\-)([^\n\r]+)\r?\n(\-+)(?:\r?\n| *$)"
  50 #regex3 = r"(?:^|\n)(?!\~)([^\n\r]+)\r?\n(\~+)(?:\r?\n| *$)"
  51 #regex4 = r"(?:^|\n)(?!\#)([^\n\r]+)\r?\n(\#+)(?:\r?\n| *$)"
  52 #regex5 = r"(?:^|\n)(?!\*)([^\n\r]+)\r?\n(\*+)(?:\r?\n| *$)"
  53
  54 # there is a problem with raw strings (r"...") in the regex search below
  55 # workaround: using \\ to mask special characters in regex
  56 regex_list = [
  57     "(?:^|\\n)(?!\\=)([^\\n\\r]+)\\r?\\n(\\=+)(?:\\r?\\n| *$)",
  58     "(?:^|\\n)(?!\\-)([^\\n\\r]+)\\r?\\n(\\-+)(?:\\r?\\n| *$)",
  59     "(?:^|\\n)(?!\\~)([^\\n\\r]+)\\r?\\n(\\~+)(?:\\r?\\n| *$)",
  60     "(?:^|\\n)(?!\\#)([^\\n\\r]+)\\r?\\n(\\#+)(?:\\r?\\n| *$)",
  61     "(?:^|\\n)(?!\\*)([^\\n\\r]+)\\r?\\n(\\*+)(?:\\r?\\n| *$)",
  62     ]
  63
  64 # DBUG only
  65 #for regex in regex_list:
  66 #    print(repr(regex))
  67
  68 #filename = './master_indexrst_docs_root.log'
  69 #filename = './master_rstfiles.log'
  70
  71 if os.path.isfile(args.filename):
  72     with open(args.filename) as fn:
  73         # read first line
  74         line = fn.readline()
  75         #print("DBUG: line={}".format(line))
  76         file_cnt = 0
  77         while line:
  78             rstfile         = "./" + re.sub('\[|\]', '', line).strip()
  79             repository_tmp1 = re.sub('\].+$', '',line).strip()
  80             repository      = re.sub('\[', '',repository_tmp1).strip()
  81             project_tmp1    = re.sub('\].+$', '',line).strip()
  82             project_tmp2    = re.sub('\/.+$', '',project_tmp1).strip()
  83             project         = re.sub('\[', '',project_tmp2).strip()
  84             #print("DBUG:       file #{} {}".format(file_cnt, rstfile))
  85             #print("DBUG: repository #{} {}".format(file_cnt, repository))
  86             #print("DBUG:    project #{} {}".format(file_cnt, project))
  87             file_cnt += 1
  88             if os.path.isfile(rstfile):
  89                 with open(rstfile, 'r') as content:
  90                     content_rstfile = content.read()
  91                     #print("DBUG: content_rstfile = \n{}".format(content_rstfile))
  92                     regex_cnt = 0
  93                     for regex in regex_list:
  94                         regex_cnt += 1
  95                         m = re.search(regex, content_rstfile, re.MULTILINE)
  96                         #print("DBUG: using regex  " + repr(regex))
  97                         #print("DBUG: using regex1 " + repr(regex1))
  98                         #print("DBUG: regex_cnt = {}".format(regex_cnt))
  99                         if m:
 100                             match = m.group(1)
 101                             #print ("DBUG: |REGEX| {} |REGEXCNT| {} |FILECNT| {} |FILE| {} |MATCH| {}".format(repr(regex), regex_cnt, file_cnt, rstfile, match))
 102                             # end regex loop if we have a title
 103                             break
 104                         else:
 105                             match = "NO-TITLE-FOUND"
 106                             #print ("DBUG: NO-TITLE-FOUND")
 107             else:
 108                 print ("ERR:  File {} does not exist".format(rstfile))
 109
 110             #print ("DBUG: |REGEX| {} |REGEXCNT| {} |FILECNT| {} |FILE| {} |MATCH| {}".format(repr(regex), regex_cnt, file_cnt, rstfile, match))
 111             #print ("DBUG: file #{} '{}' '{}'".format(file_cnt, rstfile, match))
 112
 113             # clean up result and print
 114             match_1 = match.replace(",", "") # remove ,
 115             match_final = match_1.strip()    # remove \n
 116             print ("{},{},{},{}".format(project.strip(), repository.strip(), line.strip(), match_final.strip()))
 117
 118             # read next line and loop
 119             line = fn.readline()
 120 else:
 121     print ("ERR:  File {} does not exist".format(args.filename))
 122
 123 sys.exit()
 124
 125 #
 126 # example code to show detailed regex matches and group content
 127 # to be used in a future version of this program
 128 #
 129 # matches = re.finditer(regex2, content, re.MULTILINE)
 130 # for matchNum, match in enumerate(matches, start=1):
 131 #     print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
 132 #     print ("{match}".format(match = match.group()))
 133 #     for groupNum in range(0, len(match.groups())):
 134 #         groupNum = groupNum + 1
 135 #         print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
 136 # print ("Test:" "{group}".format(group = match.group(1)))
 137 #
 138
 139 #
 140 # example code for pandas
 141 # to be used in a future version of this program
 142 #
 143 # import pandas as pd
 144 # pd.set_option('display.max_rows', 500)
 145 # pd.set_option('display.max_columns', 500)
 146 # pd.set_option('display.width', 1000)
 147 #
 148 # table = pd.read_csv("master_table.csv")
 149 # print(table)
 150 #