DocumentSummary.Rmd

---
jupyter:
  jupytext:
    text_representation:
      extension: .Rmd
      format_name: rmarkdown
      format_version: '1.2'
      jupytext_version: 1.11.2
  kernelspec:
    display_name: Python (py38)
    language: python
    name: py38
---

```{python}
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
import wikipedia
import en_core_web_sm
 
# Get wiki content.
wikisearch = wikipedia.page("Amitabh Bachchan")
wikicontent = wikisearch.content
nlp = en_core_web_sm.load()
doc = nlp(wikicontent)

# Save the wiki content to a file (for reference).
# The context manager closes the file even if the write fails, and the
# explicit UTF-8 encoding avoids an encode error when the article contains
# characters that the platform's default encoding cannot represent.
with open("wikicontent.txt", "w", encoding="utf-8") as f:
    f.write(wikicontent)

# Summary (5% of the original content: ratio=0.05).
summ_per = summarize(wikicontent, ratio=0.05)
print("Percent summary")
print(summ_per)

# Summary (200 words)
summ_words = summarize(wikicontent, word_count=200)
print("Word count summary")
print(summ_words)
```

```{python}
from gensim.summarization import summarize
import requests
from bs4 import BeautifulSoup
```

```{python}
url = 'https://towardsdatascience.com/text-summarization-in-python-76c0a41f0dc4'
# The timeout keeps the cell from hanging indefinitely on an unresponsive
# server; raise_for_status() turns an HTTP error response into an exception
# instead of letting an error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
page = response.text
```

```{python}
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parsed tree could differ between
# machines. "html.parser" ships with the standard library.
soup = BeautifulSoup(page, "html.parser")
```

```{python}
# Collect every <div> whose class attribute is exactly this string.
# find_all is the current spelling (findAll is the legacy alias) and matches
# the find_all call used later in this notebook for the <p> tags.
bodies = soup.find_all("div", {"class": "af hx ac dl w x"})
print(bodies)
```

```{python}
# Report how many matching <div> containers were collected in `bodies`.
print(len(bodies))
```

```{python}
# Take the first matched container and print the text of its <h1> heading.
idx = 0
title = bodies[idx].find("h1").get_text()
print(title)
```

```{python}
# Gather the text of every <p> inside the selected container and join the
# paragraphs with single spaces to form the article body.
p_tags = bodies[idx].find_all('p')
texts = [p.text for p in p_tags]

article = ' '.join(texts)
```

```{python}
# Summarize the scraped article with gensim's summarize(), passing ratio=0.3,
# and print the resulting summary.
summary = summarize(article, ratio=0.3)
print(summary)
```

```{python}
# Replace the scraped article with a hand-written project description so the
# summarizer can be tried on a short, bullet-style business text.
article = """The current accounting system version has been identified as needing to migrate to a future software:
-	Version is on premises, and Sun is moving all future development to Cloud
-	Sun Sustainment project in 2018 identified a number of constraints / opportunities and recommend that a version upgrade at minimum.
The scope of this project is to Plan the migration of the accounting system from current sun version to a future software.  The deliverable in 2020 will be a migration plan, including the following elements;
-	When the migration should occur
-	A recommendation as to which software should the migration be to.  Note this will probably involve a light system selection project, but should not involve a detailed RFP 
-	Identify any opportunities to be considered in the migration beyond the core accounting system (i.e. opportunities to not just replace like for like), such as Procurement and CPM/reporting.
-	Time effort and money estimates to do the migration
The Migration Plan should be completed by the end of April 2020.
"""
# Same summarize() call as for the scraped article (ratio=0.3).
summary = summarize(article, ratio=0.3)
print(summary)
```

```{python}
try:
    import docx
except:
    # %pip install docx
```

```{python}
import docx

```

```{python}
def get_fileinfo(filename):
    """Return formatted modification time, access time and size of a file.

    Args:
        filename: Path passed to ``os.stat``.

    Returns:
        A ``(mod_time, acc_time, file_size)`` tuple. Both times are local-time
        strings formatted as ``YYYY-MM-DD HH:MM:SS`` and the size is the
        ``ST_SIZE`` entry of the stat result. If anything fails, the error is
        logged and ``("", "", 0)`` is returned instead of raising.
    """
    stamp_layout = '%Y-%m-%d %H:%M:%S'

    def _stamp(epoch_seconds):
        # Render an epoch timestamp as a local-time string.
        return time.strftime(stamp_layout, time.localtime(epoch_seconds))

    try:
        info = os.stat(filename)
        modified = _stamp(info[stat.ST_MTIME])
        accessed = _stamp(info[stat.ST_ATIME])
        size = info[stat.ST_SIZE]
    except Exception as e:
        # Best-effort: report the problem and fall back to empty values.
        logger.info("ERROR: fileinfo {}".format(e))
        return "", "", 0

    return modified, accessed, size
        
def scanfiles(folder, filter=None):
    """Walk *folder* recursively and yield one metadata dict per file.

    Args:
        folder: Root directory handed to ``os.walk``.
        filter: Optional regular expression (a string or a compiled pattern).
            When given, only files whose name matches somewhere
            (``search`` semantics) are yielded.

    Yields:
        A dict with the keys ``folder``, ``file``, ``modified``, ``accessed``
        and ``size``. When a filter is used, ``matches`` holds the
        ``groupdict()`` of every match of the pattern in the file name.
    """
    # Compile once up front rather than re-checking the pattern type for
    # every file inside the loop.
    pattern = filter
    if pattern is not None and not isinstance(pattern, re.Pattern):
        pattern = re.compile(pattern)

    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            logger.debug(filename)

            m_data = None
            if pattern is not None:
                if not pattern.search(filename):
                    logger.debug("Skipping: {}".format(filename))
                    continue

                m_data = [match.groupdict() for match in pattern.finditer(filename)]

            # Only the metadata lookup is guarded; the yield stays outside the
            # try so an exception thrown into the generator by its consumer
            # is not silently swallowed here.
            try:
                mTime, aTime, fSize = get_fileinfo(filepath)
            except Exception as e:
                logger.info("ERROR: scan files failed {}".format(e))
                continue

            data = {
                'folder': dirpath,
                'file': filename,
                'modified': mTime,
                'accessed': aTime,
                'size': fSize,
            }
            if m_data:
                data['matches'] = m_data

            yield data
```

```{python}
import os
import stat
from datetime import datetime
import logging
import time
import re

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# On Windows, os.path.join('H:', 'IT', ...) yields 'H:IT\...', which is
# relative to the current directory on drive H:. Appending the separator to
# the drive anchors the path at the drive root ('H:\IT\...'). On POSIX the
# joined result is unchanged.
paths = ['H:' + os.sep, 'IT', 'Budgets', 'Capital', '2020', 'Raw KCP Submissions']
root_path = os.path.join(*paths)
print(root_path)

# Materialize the generator so the records can be reused by later cells.
files = list(scanfiles(root_path))
```

```{python}
import pandas as pd


# Turn the list of per-file records into a DataFrame and preview the first
# rows in the notebook output.
df_files = pd.DataFrame(files)
display(df_files.head())
```

```{python}
import sys
# Show the interpreter version string, then the structured version tuple,
# each on its own line.
print(sys.version, sys.version_info, sep="\n")
```

```{python}
from openpyxl import load_workbook, Workbook
from openpyxl.utils import column_index_from_string
from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font, Color

def load_WB(path):
    """Open the workbook at *path*, or create a new one if it does not exist.

    Args:
        path: Location of the workbook file.

    Returns:
        The workbook loaded from *path* when the file exists; otherwise a new
        ``Workbook`` that this function also tries to save to *path*.
    """
    if os.path.exists(path):
        wb = load_workbook(path)
        print('Loaded ... {}'.format(path))
        return wb

    print('Creating new workbook...')
    wb = Workbook()
    try:
        wb.save(path)
    except Exception as e:
        # Saving stays best-effort (the in-memory workbook is still returned),
        # but the failure is reported instead of being silently discarded,
        # and KeyboardInterrupt/SystemExit are no longer swallowed.
        print('Could not save new workbook to {}: {}'.format(path, e))

    return wb
```

```{python}
# Write the file listing to an Excel workbook, in a sheet named "files".
wb_path = 'KCPList.xlsx'

with pd.ExcelWriter(wb_path) as f:
    # index=False keeps the DataFrame's row index out of the sheet.
    df_files.to_excel(f, sheet_name='files', index=False)
```

```{python}

```

```{python}

```