diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..1ba1998 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +# Editor configuration, see https://editorconfig.org +root = true + +[*] +charset = utf-8 +indent_style = space +indent_size = 2 +insert_final_newline = true +trim_trailing_whitespace = true + +[*.md] +max_line_length = off diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..f34fd6a --- /dev/null +++ b/.flake8 @@ -0,0 +1,13 @@ +[flake8] +# Rule definitions: http://flake8.pycqa.org/en/latest/user/error-codes.html +# D203: 1 blank line required before class docstring +# W503: line break before binary operator +# W504: line break after binary operator +# F401: file imported but not used +# F841: local variable is assigned to but never used +exclude = __pycache__,node_modules,.git,.pytest_cache,docs +ignore = D203,W503,W504,F401 +max-complexity = 24 +max-line-length = 120 +per-file-ignores = + codonPython/tests/file_utils_test.py:F841 \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index ba38699..0bb876a 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -8,7 +8,7 @@ assignees: '' --- **Is your feature request related to a problem? Please describe.** -A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] +A clear and concise description of what the problem is. Eg. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen and why. 
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..6909a75 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,12 @@ +* **Please check if the Pull Request fulfills these requirements** +- [ ] The commit message is clear and concise +- [ ] Test for the changes have been added and reviewed +- [ ] Documentation has been added to all new features and edited code has had documentation reviewed + +* **What kind of change does this Pull request introduce?** (Bug fix, feature, docs update, ...) + +* **What is the current behaviour?** (Link to a current open issue if possible) + +* **What is the new behaviour?** (is this a feature change?) + +* **Does this Pull Request introduce a breaking change?** (What changes might users need to make in their application due to this PR?) \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2b5a74d..2f981f6 100644 --- a/.gitignore +++ b/.gitignore @@ -124,3 +124,6 @@ dmypy.json # Pyre type checker .pyre/ + +# Text editors +.vscode \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index b9dd3c0..efa8a4a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,24 @@ language: python + python: - 3.6 + install: + - sudo apt-get install unixodbc-dev - pip install -r requirements.txt - pip install codecov - pip install pytest pytest-cov + script: - - pytest --cov=./ + - pytest --cov=./ + after_success: - codecov + - cd docs && make html + +deploy: + provider: pages + skip_cleanup: true + github_token: $githubtoken + local_dir: docs/build/html + keep_history: true diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f3911ef..a2c4571 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,12 @@ # How to contribute -First off, thank you for taking the time to contribute! If you have a functionality that you would like to see in codon, we have a few standards and guidelines so we can merge your pull request quicker. 
+First off, thank you for taking the time to contribute! If you have a function that you would like to see in codon, we have a few standards and guidelines that we would like you to follow before we consider your merge request. Failure to follow the contribution guide will result in your merge request being challenged or rejected. + +We are looking for functions and/or classes which are useful for workflows in DIS specifically. Please do not submit the following... +* End-to-end scripts cannot be implemented into the package. If you find something reusable within your end-to-end script, then please feel free to extract it and submit it to Codon with tests attached. +* Multiple functions that are unrelated. For example, "This function takes an integer number and rounds it to the nearest whole number". Please do not include multiple functions unless they're methods of a class or are related to the same file (i.e. two methods of suppression) +* Duplicated functionality. For example, if your function is already done by another well known package. +* Irrelevant functionality. If the function you submit is unrelated to DIS, it will most likely be challenged or rejected. ## Basic idea @@ -8,18 +14,39 @@ First off, thank you for taking the time to contribute! If you have a functional 2. Write your documented function and tests (:heart_eyes:) on a new branch, coding in line with our **coding conventions**. -3. Submit a [pull request](https://help.github.com/en/articles/creating-a-pull-request) to codonPython with a clear description of what you have done. +3. Submit a [pull request](https://help.github.com/en/articles/creating-a-pull-request) **to the dev branch** of codonPython with a clear description of what you have done. We suggest you make sure all of your commits are atomic (one feature per commit). Please make sure that non-obvious lines of code are commented, and variable names are as clear as possible.
Please do not send us undocumented code as we will not accept it. Including tests to your pull request will bring tears of joy to our eyes, and will also probably result in a faster merge. - ## Coding conventions -Start reading our code to get a feel for it: +We use the industry standard [PEP 8](https://www.python.org/dev/peps/pep-0008/) styling guide within the `codonPython` package. **Therefore, it’s imperative that you use the coding standards found within PEP 8 when creating or modifying any code within the `codonPython` package**. Autoformatters for PEP8, for instance [black](https://black.readthedocs.io/en/stable/), can easily ensure compliance. The reason we use PEP 8 coding standards is to make sure there is a layer of consistency across our codebase. This reduces the number of decisions that you need to make when styling your code, and also makes code easier to read when switching between functions etc. + +While you are creating code, we recommend that you understand the style guide standards for the following topics: + +* [Code layout](https://www.python.org/dev/peps/pep-0008/#code-lay-out) – Indentation, tabs or spaces, maximum line length, blank lines, source file encoding, imports & module level Dunder name +* [String quotes](https://www.python.org/dev/peps/pep-0008/#string-quotes) +* [Whitespace in expressions and statements](https://www.python.org/dev/peps/pep-0008/#whitespace-in-expressions-and-statements) – Pet Peeves, alternative recommendations +* [When to use trailing commas](https://www.python.org/dev/peps/pep-0008/#when-to-use-trailing-commas) +* [Comments](https://www.python.org/dev/peps/pep-0008/#comments) – Block comments, inline comments & documentation strings (docstrings) +* [Naming conventions](https://www.python.org/dev/peps/pep-0008/#naming-conventions) – Naming styles, naming conventions, names to avoid, ASCII compatibility, package and module names, class names, type variable names, exception names, global variable names, function 
and variable names, function and method arguments, method names and instance variables, constants & designing for inheritance +* [Programming recommendations](https://www.python.org/dev/peps/pep-0008/#programming-recommendations) – Function annotations & variable annotations + +We also use docstrings and we try to follow [`numpy`'s docstring standards](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard). + +Start reading our code to get a feel for it but most importantly, remember that this is open source software - consider the people who will read your code, and make it look nice for them. -* We use [PEP8](https://www.python.org/dev/peps/pep-0008/). Autoformatters for PEP8, for instance [autopep8](https://pypi.org/project/autopep8/), can easily ensure compliance. +* We use [PEP8](https://www.python.org/dev/peps/pep-0008/). Autoformatters for PEP8, for instance [black](https://black.readthedocs.io/en/stable/), can easily ensure compliance. * We use docstrings and we try to (loosely) follow [`numpy`'s docstring standards](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard). * This is open source software. Consider the people who will read your code, and make it look nice for them. - + +## Tests + +We do ask that you include some basic tests with your contributions. While the logic of your contribution is important, some basic unit tests to verify functionality and data types for the inputs are requested for a baseline level of assurance and 'elegant failing'. + +## Code of Conduct + +As a contributor you can help us keep the Codon community open and inclusive. Please read and follow our [Code of Conduct](https://github.com/codonlibrary/code-of-conduct/tree/master). By contributing, you agree to comply with it. + :clinking_glasses: Thank you!
Team codon diff --git a/README.md b/README.md index bfb99ee..e46bc25 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,48 @@ -# Example package +# codonPython [![Build Status](https://travis-ci.com/codonlibrary/codonPython.svg?branch=master)](https://travis-ci.com/codonlibrary/codonPython) [![codecov](https://codecov.io/gh/codonlibrary/codonPython/branch/master/graph/badge.svg)](https://codecov.io/gh/codonlibrary/codonPython) -This is a simple example of a future package pulled from the DPS Core code base. -The package can be directly installed by typing in your terminal: `python -m pip install --user git+https://github.com/codonlibrary/codonPython.git` - -Include info on what each text file / folder is: -.github/ISSUE_TEMPLATE = -codonPython = -.gitignore = -.travis.yml = -CONTRIBUTING.md = -LICENSE = -MANIFEST.in = -README.md = -pytest.ini = -requirements.txt = -setup.py = +## What is `codon`? + +The `codon` project was created to increase code sharing, consistency of code and coding standards and encourage collaboration. The repository contains Python, R and SQL code for ease of use in workflows on platforms used within NHS Digital such as Databricks. Package documentation is available on the [GitHub pages](https://codonlibrary.github.io/codonPython/) however further information for NHS Digital staff can be found [here](https://confluence.digital.nhs.uk/display/CON/Codon+-+Code+sharing), via our internal confluence page. `codonPython` aims to reduce the barrier for entry for analysis and provide software development experience for those at a higher level of technical ability. + +### Why `codon`? + +In biological terms, a `codon` is one of the building blocks that make up our DNA. By openly sharing our code we hope that others will be able to take those blocks of code to build their own processes using the data that NHS Digital hold.
+ +By sharing code, the project aims to: + +**Increase Transparency**: To align with government data principles and build public trust. + +**Improve Code**: To innovate and improve the code we use and provide. + +**Improve usability**: By increasing the accessibility and uniformity of code, it becomes easier for data users to find and use relevant code. + +**Be more cost effective**: Reusable 'generalised' code will increase efficiency in creating higher level processes. + + +## Installation +The package can be directly installed by typing in your terminal: +```r +python -m pip install --user git+https://github.com/codonlibrary/codonPython.git +``` +For further guidance on cloning from the remote repository to a local repository on your machine see the information in our [Wiki.](https://github.com/codonlibrary/codonPython/wiki/1.-Installing-codonPython) + +## Contributing to codonPython +All new contributions to `codon` are welcome; please follow the Coding Conventions in the [guidance document](https://github.com/codonlibrary/codonPython/blob/master/CONTRIBUTING.md) for contribution guidance. + +Any improvements to documentation, bug fixes or general code enhancements are also welcomed. If a bug is found on the master branch, please use the GitHub guidance on raising an [issue.](https://help.github.com/en/github/managing-your-work-on-github/creating-an-issue) + +## New to GitHub? +GitHub is a hosting site that allows for development and version control of software using Git. It allows users to edit and develop parts of code independently before submitting back to the master code, whilst using version control to track changes. 
Introductory videos to GitHub for beginners can be found [here.](https://github.com/codonlibrary/codonPython/wiki/2a.-GitHub-for-Beginners) + +Quick links to beginner guidance can also be found below: + +* [**Cloning a repository to your local machine using GitBash**](https://github.com/codonlibrary/codonPython/wiki/1.-Installing-codonPython) +* [**Checking out a branch using GitBash**](https://github.com/codonlibrary/codonPython/wiki/2b.-Checkout-a-branch-using-GitBash) +* [**Removing a Commit from a repository using GitBash**](https://github.com/codonlibrary/codonPython/wiki/3.-Removing-a-Commit-From-a-GitHub-Repository) + +All other `codon` "How-to Articles" can be found [here.](https://github.com/codonlibrary/codonPython/wiki/2.-Git-Guidance) + +Suggestions regarding additional guidance or How-to articles are welcome. diff --git a/codonPython/ODS_lookup.py b/codonPython/ODS_lookup.py new file mode 100644 index 0000000..e67df2c --- /dev/null +++ b/codonPython/ODS_lookup.py @@ -0,0 +1,97 @@ +import requests +from typing import Dict, Iterable, Callable, List, Optional +import pandas as pd +import numpy as np + + +def query_api(code: str) -> Dict: + """Query the ODS (organisation data service) API for a single org code + and return the full JSON result. Full API docs can be found here: + https://digital.nhs.uk/services/organisation-data-service/guidance-for-developers/organisation-endpoint + + Parameters + ---------- + code : str + 3 character organization code. + + Returns + ---------- + dict + The data returned from the API. 
+ + Examples + --------- + >>> result = query_api("X26") + >>> result["Organisation"]["Name"] + 'NHS DIGITAL' + >>> result["Organisation"]["GeoLoc"]["Location"]["AddrLn1"] + '1 TREVELYAN SQUARE' + """ + if not isinstance(code, str): + raise ValueError(f"ODS code must be a string, received {type(code)}") + + response = requests.get( + f"https://directory.spineservices.nhs.uk/ORD/2-0-0/organisations/{code}" + ).json() + if "errorCode" in response: + error_code = response["errorCode"] + error_text = response["errorText"] + raise ValueError( + f"API query failed with code {error_code} and text '{error_text}'." + ) + return response + + +def get_addresses(codes: Iterable[str]) -> pd.DataFrame: + """Query the ODS (organisation data service) API for a series of + org codes and return a data frame containing names and addresses. + Invalid codes will cause a message to be printed but will + otherwise be ignored, as an incomplete merge table is more + useful than no table at all. + + Parameters + ---------- + codes : list, ndarray or pd.Series + 3 character organization codes to retrieve information for. + + Returns + ---------- + DataFrame + Address information for the given org codes. 
+ + Examples + --------- + >>> result = get_addresses(pd.Series(["X26"])) + >>> result.reindex(columns=sorted(result.columns)) + Org_AddrLn1 Org_Code Org_Country Org_Name Org_PostCode Org_Town + 0 1 TREVELYAN SQUARE X26 ENGLAND NHS Digital LS1 6AE LEEDS + """ + + # Internal helper function to take the full result of a query + # and extract the relevant fields + def extract_data(api_result: Dict, code: str) -> Dict[str, str]: + org_info = api_result["Organisation"] + org_name = org_info["Name"] + org_address = org_info["GeoLoc"]["Location"] + result = { + "Org_Code": code, + "Org_Name": org_name.title().replace("Nhs", "NHS"), + **{f"Org_{k}": v for k, v in org_address.items() if k != "UPRN"}, + } + return result + + # Remove duplicate values + to_query = set(codes) + if np.nan in to_query: + # 'NaN' is actually a valid code but we don't want it for null values + to_query.remove(np.nan) + + result = [] + for code in to_query: + try: + api_result = query_api(code) + result.append(extract_data(api_result, code)) + except ValueError as e: + print(f"No result for ODS code {code}. 
{e}") + continue + return pd.DataFrame(result) diff --git a/codonPython/SQL_connections.py b/codonPython/SQL_connections.py new file mode 100644 index 0000000..f2cac3e --- /dev/null +++ b/codonPython/SQL_connections.py @@ -0,0 +1,33 @@ +''' Author(s): Sam Hollings +Desc: this module contains SQL_alchemy engines to connect to commonly used databases''' + +from sqlalchemy import create_engine + + +def conn_dss(): + '''Returns sqlalchemy Engine to connect to the DSS 2008 server (DMEDSS) DSS_CORPORATE database ''' + engine = create_engine('mssql+pyodbc://DMEDSS/DSS_CORPORATE?driver=SQL+Server') + return engine + + +def conn_dss2016uat(): + '''Returns sqlalchemy Engine to connect to the DSS 2016 server (UAT) (DSSUAT) DSS_CORPORATE database ''' + conn = create_engine('mssql+pyodbc://DSSUAT/DSS_CORPORATE?driver=SQL+Server') + return conn + + +def conn_dummy(path=r''): + '''connect to the sqlite3 database in memory, or at specified path + parameters + ---------- + path : string + The location and file in which the database for conn_dummy will be stored. Default is memory (RAM) + ''' + + conn_string = 'sqlite://' + if path != '': + path = '/' + path + + conn = create_engine(r'{0}{1}'.format(conn_string, path)) + + return conn diff --git a/codonPython/age_bands.py b/codonPython/age_bands.py index ff62bab..ce1e28b 100644 --- a/codonPython/age_bands.py +++ b/codonPython/age_bands.py @@ -5,7 +5,7 @@ def age_band_5_years(age: int) -> str: """ Place age into appropriate 5 year band - This function takes the age supplied as an argument and returns a string + This function takes the age supplied as an argument and returns a string representing the relevant 5 year banding. 
Parameters @@ -29,26 +29,26 @@ def age_band_5_years(age: int) -> str: """ if age is None: - return 'Age not known' + return "Age not known" if age >= 90: if age >= 150: raise ValueError("The age input: {} is too large.".format(age)) else: - return '90 and over' + return "90 and over" elif age < 0: raise ValueError("The age input: {} is too low.".format(age)) else: lowerbound = 5 * int(math.floor(age / 5)) upperbound = lowerbound + 4 - return '{}-{}'.format(lowerbound, upperbound) + return "{}-{}".format(lowerbound, upperbound) def age_band_10_years(age: int) -> str: """ Place age into appropriate 10 year band - This function takes the age supplied as an argument and returns a string + This function takes the age supplied as an argument and returns a string representing the relevant 10 year banding. Parameters @@ -72,16 +72,16 @@ def age_band_10_years(age: int) -> str: """ if age is None: - return 'Age not known' + return "Age not known" if age >= 90: if age >= 150: raise ValueError("The age input: {} is too large.".format(age)) else: - return '90 and over' + return "90 and over" elif age < 0: raise ValueError("The age input: {} is too low.".format(age)) else: lowerbound = 10 * int(math.floor(age / 10)) upperbound = lowerbound + 9 - return '{}-{}'.format(lowerbound, upperbound) + return "{}-{}".format(lowerbound, upperbound) diff --git a/codonPython/file_utils.py b/codonPython/file_utils.py new file mode 100644 index 0000000..83ae2cc --- /dev/null +++ b/codonPython/file_utils.py @@ -0,0 +1,332 @@ +import pandas as pd +import os + + +def file_search(path=".", doctype="csv", like=[""], strict=False): + """ + This function creates a list of all files of a certain type, satisfying the criteria outlined + in like = [...] parameter. The function only searches for files in the specified folder + of the current working directory that is set by the user. + + Parameters + ----------- + path : string + Path to a folder in the current working directory + default = '.', i.e. 
current working directory folder + doctype : string + Document format to search for + e.g. 'csv' or 'xlsx' + default = 'csv' + like : list + A list of words to filter the file search on + default = [''], i.e. no filter + strict : bool + Set True to search for filenames containing all words from 'like' list ( + default = False + + Returns + ------- + list + + Examples + ------- + >>> file_search(doctype = 'md') + ['README.md', 'CONTRIBUTING.md'] + + >>> file_search(doctype = 'md', like = ['READ']) + ['README.md'] + + """ + + if not isinstance(path, str): + raise ValueError("Please input path as a string") + elif not isinstance(doctype, str): + raise ValueError("Please input doctype as a string") + elif not isinstance(like, list): + raise ValueError("Please input like as a list") + elif not isinstance(strict, bool): + raise ValueError("Please input strict as a bool") + else: + pass + + list_of_files = [] + + if strict is False: + for file in os.listdir(path): + if (file.split(".")[-1] == doctype) & (any(x in file for x in like)): + list_of_files.append(file) + else: + for file in os.listdir(path): + if (file.split(".")[-1] == doctype) & (all(x in file for x in like)): + list_of_files.append(file) + + return list_of_files + + +def import_files( + path=".", doctype="csv", sheet="Sheet1", subdir=False, like=[""], strict=False +): + """ + This function imports all documents of a given format to a dictionary + and returns this dictionary, keeping original file names. + + Parameters + ---------- + path : string + Path to a folder in the current working directory + default = '.', i.e. current working directory folder + doctype : string + Document format to search for + e.g. 'csv' or 'xlsx' + default = 'csv' + sheet : string + Sheet name of the xlsx file + default = 'Sheet1' + subdir : bool + True to allow download all files, including the subdirectories + default = False + like : list + A list of words to filter the file search on + default = [''], i.e. 
no filter + strict : bool + Set True to search for filenames containing all words from 'like' list + default = False + + Returns + ------- + out : dict + + Examples + -------- + + '>>> import_files()' + + File Data_AprF_2019 is successfully imported + + File Data_AugF_2019 is successfully imported + + File Data_JulF_2019 is successfully imported + + File Data_JunF_2019_v1 is successfully imported + + File Data_MayF_2019 is successfully imported + + File Data_SepP_2019 is successfully imported + + '>>> import_files(like = ['Aug','Sep'])' + + File Data_AugF_2019 is successfully imported + + File Data_SepP_2019 is successfully imported + + + """ + + if not isinstance(path, str): + raise ValueError("Please input path as a string") + elif not isinstance(doctype, str): + raise ValueError("Please input doctype as a string") + elif not isinstance(sheet, str): + raise ValueError("Please input sheet as a string") + elif not isinstance(subdir, bool): + raise ValueError("Please input subdir as a bool") + elif not isinstance(like, list): + raise ValueError("Please input like as a list") + elif not isinstance(strict, bool): + raise ValueError("Please input strict as a bool") + else: + pass + + dict_files = {} + if subdir is True: + + for r, d, f in os.walk(path): + for file in f: + b = any(x in file for x in like) + if strict is True: + b = all(x in file for x in like) + if (file.split(".")[-1] == doctype) & (b is True): + k = file.strip("." 
+ doctype) + try: + name = os.path.join(r, file) + print("\nImporting " + k + "...", end="", flush=True) + if doctype == "csv": + dict_files[name.strip(".\\").strip(".csv")] = pd.read_csv( + name + ) + print("\rFile " + k + " is successfully imported") + else: + dict_files[ + name.strip(".\\").strip(".xlsx") + ] = pd.read_excel(name, sheet_name=sheet) + print("\rFile " + k + " is successfully imported") + except Exception as ex: + raise (ex) + else: + for file in os.listdir(path): + b = any(x in file for x in like) + if strict is True: + b = all(x in file for x in like) + + if (file.split(".")[-1] == doctype) & (b is True): + k = file.strip("." + doctype) + try: + name = os.path.join(path, file) + print("\nImporting " + k + "...", end="", flush=True) + if doctype == "csv": + dict_files[k] = pd.read_csv(name) + print("\rFile " + k + " is successfully imported") + else: + dict_files[k] = pd.read_excel(name, sheet_name=sheet) + print("\rFile " + k + " is successfully imported") + except Exception as ex: + raise (ex) + + return dict_files + + +def compare(x, y, names=["x", "y"], dups=False, same=False, comment=False): + """ + This function returns a dictionary with: + + 1. Same values between data frames x and y + 2. Values in x, not in y + 3. Values in y, not in x + + (optional): + (4) Duplicates of x + (5) Duplicates of y + (6) Boolean of whether x and y are the same + + Parameters + ---------- + x : pandas.DataFrame + DataFrame #1 + y : pandas.DataFrame + DataFrame #2 + names : list + a list of user preferred file names + e.g. ['File1', 'File2'] + default = ['x','y'] + dups : bool + True to include duplicates check for each file + default = False + same : bool + True to activate. Outputs True if DataFrames are the same + default = False + comment : bool + True to activate. Prints out statistics of the compariosn results + e.g. 
number of same valeus, number of duplicates, number of outliers and whether the DataFrames are the same + default = False + + Returns + ------- + out : dict + + Examples + -------- + + '>>> c = compare(df1, df2, names = ['df1','df2'], dups = True, same = True, comment =True)' + + There are 133891 same values + There are 16531 outliers in df1 + There are 20937 outliers in df2 + There are 48704 duplicates in df1 + There are 0 duplicates in df2 + The DataFrames are not the same + + '>>> c = compare(df2, df2, names = ['df2','df2'], dups = True, same = True, comment =True)' + + There are 154444 same values + There are 0 outliers in df2 + There are 0 outliers in df2 + There are 0 duplicates in df2 + There are 0 duplicates in df2 + The DataFrames are the same + """ + + if not isinstance(x, pd.DataFrame): + raise ValueError("Please input x as a pandas.DataFrame") + elif not isinstance(y, pd.DataFrame): + raise ValueError("Please input y as a pandas.DataFrame") + elif not isinstance(names, list): + raise ValueError("Please input names as a list") + elif not isinstance(dups, bool): + raise ValueError("Please input dups as a bool") + elif not isinstance(same, bool): + raise ValueError("Please input same as a bool") + elif not isinstance(comment, bool): + raise ValueError("Please input comment as a bool") + + dict_temp = {} + + try: + dict_temp["same_values"] = pd.merge( + x.drop_duplicates(), y.drop_duplicates(), how="inner" + ) + except Exception as ex: + raise (ex) + try: + dict_temp[names[0] + "_not_" + names[1]] = pd.concat( + [x, dict_temp["same_values"]], ignore_index=True + ).drop_duplicates(keep=False) + dict_temp[names[1] + "_not_" + names[0]] = pd.concat( + [y, dict_temp["same_values"]], ignore_index=True + ).drop_duplicates(keep=False) + except Exception as ex: + raise (ex) + + if dups is True: + try: + dict_temp[names[0] + "_dups"] = x[x.duplicated()] + dict_temp[names[1] + "_dups"] = y[y.duplicated()] + except Exception as ex: + raise (ex) + if same is True: + 
try: + if (x.shape == y.shape) & (x.shape == dict_temp["same_values"].shape): + dict_temp["Same"] = True + else: + dict_temp["Same"] = False + except Exception as ex: + raise (ex) + try: + if comment is True: + print( + "\nThere are " + str(dict_temp["same_values"].shape[0]) + " same values" + ) + print( + "There are " + + str(dict_temp[names[0] + "_not_" + names[1]].shape[0]) + + " outliers in " + + str(names[0]) + ) + print( + "There are " + + str(dict_temp[names[1] + "_not_" + names[0]].shape[0]) + + " outliers in " + + str(names[1]) + ) + if dups is True: + print( + "There are " + + str(dict_temp[names[0] + "_dups"].shape[0]) + + " duplicates in " + + names[0] + ) + print( + "There are " + + str(dict_temp[names[1] + "_dups"].shape[0]) + + " duplicates in " + + names[1] + ) + if same is True: + if dict_temp["Same"] is True: + s = "the same" + else: + s = "not the same" + print("DataFrames are " + s) + except Exception as ex: + raise (ex) + + return dict_temp diff --git a/codonPython/mesh/__init__.py b/codonPython/mesh/__init__.py new file mode 100644 index 0000000..3b24788 --- /dev/null +++ b/codonPython/mesh/__init__.py @@ -0,0 +1,10 @@ +from .mesh import MESHConnection, generate_authorization +from .exceptions import ( + MESHAuthenticationError, + MESHDownloadErrors, + MESHInvalidRecipient, + MESHMessageAlreadyDownloaded, + MESHMessageMissing, + MESHMultipleMatches, + MESHUnknownError, +) diff --git a/codonPython/mesh/exceptions.py b/codonPython/mesh/exceptions.py new file mode 100644 index 0000000..9ad7c0c --- /dev/null +++ b/codonPython/mesh/exceptions.py @@ -0,0 +1,56 @@ +from requests.exceptions import ConnectionError + + +class MESHAuthenticationError(ConnectionError): + """The MESH request authentication was invalid""" + + @property + def msg(self): + return "Invalid authentication" + + +class MESHMessageMissing(ConnectionError): + """The message requested does not exist""" + + @property + def msg(self): + return "Message does not exist" + + +class 
MESHMessageAlreadyDownloaded(ConnectionError): + """The MESH request has already been downloaded""" + + @property + def msg(self): + return "Message already downloaded" + + +class MESHDownloadErrors(Exception): + """There were errors downloading MESH messages""" + + def __init__(self, exceptions): + self.exceptions = exceptions + + +class MESHInvalidRecipient(ConnectionError): + """The recipient is unknown or otherwise invalid""" + + @property + def msg(self): + return "Invalid recipient" + + +class MESHMultipleMatches(ConnectionError): + """There are multiple messages with the provided local ID""" + + @property + def msg(self): + return "Multiple messages found" + + +class MESHUnknownError(ConnectionError): + """There was an unknown error with the connection""" + + @property + def msg(self): + return "Unknown" diff --git a/codonPython/mesh/mesh.py b/codonPython/mesh/mesh.py new file mode 100644 index 0000000..a590595 --- /dev/null +++ b/codonPython/mesh/mesh.py @@ -0,0 +1,879 @@ +import platform +from dataclasses import dataclass +from datetime import datetime +from .exceptions import ( + MESHAuthenticationError, + MESHDownloadErrors, + MESHInvalidRecipient, + MESHMessageAlreadyDownloaded, + MESHMessageMissing, + MESHMultipleMatches, + MESHUnknownError, +) +from gzip import compress, decompress +from hashlib import md5 +from hmac import new as hmac +from math import ceil +from os import path +from uuid import uuid4 +import logging +from typing import Generator, Union + +import requests as r + + +@dataclass +class MESHConnection: + """Class for handling MESH API interactions. 
+ + Parameters + ---------- + mailbox : string + The MESH ID of the mailbox this client is for + password : string + The password to this mailbox + api_shared_key : string + The shared API key for the MESH environment the mailbox is in + cert_loc : string + Path to the MESH API certificate location + key_loc : string + Path to the MESH API certificate private key location + base_ca_loc : string + Path to the base MESH certificate authority certificate bundle. + Set to False to disable inbound SSL checks if necessary + root_url : string, default = "https://mesh-sync.national.ncrs.nhs.uk" + Root MESH URL. Default value is the live MESH service + org : string, default = "NHS Digital" + Name of organisation owning the mailbox + """ + + mailbox: str + password: str + api_shared_key: str + cert_loc: str + key_loc: str + base_ca_loc: str + root_url: str = "https://mesh-sync.national.ncrs.nhs.uk" + org: str = "NHS Digital" + + def check_authentication(self) -> bool: + """ + Check authentication with the MESH API. 
+ This should be done at the start of any session (per the API docs) + + Returns + ---------- + bool + Indicates if authentication was successful or not + + Raises + ---------- + MESHUnknownError + There was an unexpected return status from the MESH API + + Examples + ---------- + >>> client.check_authentication() #doctest: +SKIP + True + """ + resp = r.post( + f"{self.root_url}/messageexchange/{self.mailbox}", + headers={ + "Authorization": generate_authorization( + self.mailbox, self.password, self.api_shared_key + ), + "Mex-ClientVersion": f"pyMESHAPI0.1a", + "Mex-OSArchitecture": platform.machine(), + "Mex-OSName": platform.system(), + "Mex-OSVersion": platform.version(), + }, + cert=(self.cert_loc, self.key_loc), + verify=self.base_ca_loc, + ) + if resp.status_code == 403: + return False + if resp.status_code == 200: + return True + raise MESHUnknownError(response=resp) + + def send_file( + self, + dest_mailbox: str, + message_location: str, + workflow_id: str, + message_subject: str = None, + message_id: str = None, + process_id: str = None, + compress_message: bool = True, + encrypted: bool = False, + ): + """ + Send a file to the MESH API. + This will automatically chunk the message if required, splitting into chunks at 80MB (MESH API has a + chunk size limit of 100MB). If required, this will also compress the message before transmission using + gzip. + + Parameters + ---------- + dest_mailbox : string + MESH Mailbox ID of the recipient + message_location : string + Path to the readable file to send as a message + workflow_id : string + DTS Workflow ID + message_subject : string, default = None + Optional subject line to use for the message, for SMTP (email) messages. + message_id : string, default = None + Optional local identifier for the message. Required to track the message later. + process_id : string, default = None + Optional process ID for the MESH message. Currently not used in MESH, but included to ensure + future compatibility. 
+ compress_message : boolean, default = True + Indicates if the message should be compressed. If true, then the message will be compressed + using gzip before sending to MESH. + encrypted : boolean, default = False + Indicates if the file to send has been encrypted. This is solely used to pass a flag to MESH + and does not encrypt the file or otherwise alter processing. + + Returns + ---------- + dict + Dictionary of returned values from the MESH API + + * messageID (str): value of the MESH internal ID assigned to the sent message + + Raises + ---------- + MESHAuthenticationError + There was an authentication error accessing this page. Either the SSL certificate used is invalid, + or the client provided the wrong Mailbox ID, Password, or Shared Key. + MESHInvalidRecipient + The mailbox ID provided is not a valid recipient for this message + MESHUnknownError + There was an unexpected return status from the MESH API + + Examples + ---------- + >>> client.send_file("TEST", 'c:/test/test.txt', 'test_flow') #doctest: +SKIP + {'messageID': '20200211115928515346_9359E2'} + """ + with open(message_location, "rb") as file: + message = file.read() + filename = path.basename(message_location) + return self.send_message( + dest_mailbox=dest_mailbox, + message=message, + filename=filename, + workflow_id=workflow_id, + message_subject=message_subject, + message_id=message_id, + process_id=process_id, + compress_message=compress_message, + encrypted=encrypted, + ) + + def send_message( + self, + dest_mailbox: str, + message: bytes, + filename: str, + workflow_id: str, + message_subject: str = None, + message_id: str = None, + process_id: str = None, + compress_message: bool = True, + encrypted: bool = False, + ): + """ + Send a message to the MESH API. + This will automatically chunk the message if required, splitting into chunks at 80MB (MESH API has a + chunk size limit of 100MB). If required, this will also compress the message before transmission using + gzip. 
+ + Parameters + ---------- + dest_mailbox : string + MESH Mailbox ID of the recipient + message : bytes + Bytes representation of the file to transmit + filename : string + Original filename for the message being transmitted + workflow_id : string + DTS Workflow ID + message_subject : string, default = None + Optional subject line to use for the message, for SMTP (email) messages. + message_id : string, default = None + Optional local identifier for the message. Required to track the message later. + process_id : string, default = None + Optional process ID for the MESH message. Currently not used in MESH, but included to ensure + future compatibility. + compress_message : boolean, default = True + Indicates if the message should be compressed. If true, then the message will be compressed + using gzip before sending to MESH. + encrypted : boolean, default = False + Indicates if the file to send has been encrypted. This is solely used to pass a flag to MESH + and does not encrypt the file or otherwise alter processing. + + Returns + ---------- + dict + Dictionary of returned values from the MESH API + + * messageID (str): value of the MESH internal ID assigned to the sent message + + Raises + ---------- + MESHAuthenticationError + There was an authentication error accessing this page. Either the SSL certificate used is invalid, + or the client provided the wrong Mailbox ID, Password, or Shared Key. 
+ MESHInvalidRecipient + The mailbox ID provided is not a valid recipient for this message + MESHUnknownError + There was an unexpected return status from the MESH API + + Examples + ---------- + >>> client.send_message("TEST", b'test', 'test.txt', 'test_flow') #doctest: +SKIP + {'messageID': '20200211115928515346_9359E2'} + """ + checksum = md5(message).hexdigest() + if compress_message: + message = compress(message) + + headers = { + "Authorization": generate_authorization( + self.mailbox, self.password, self.api_shared_key + ), + "Content-Type": "application/octet-stream", + "Mex-From": self.mailbox, + "Mex-To": dest_mailbox, + "Mex-WorkflowID": workflow_id, + "Mex-Filename": filename, + "Mex-MessageType": "DATA", + "Mex-Version": "1.0", + "Mex-Checksum": f"md5 {checksum}", + } + + if process_id is not None: + headers["Mex-ProcessID"] = process_id + if message_id is not None: + headers["Mex-LocalID"] = message_id + if compress_message: + headers["Mex-Content-Compressed"] = "Y" + headers["Content-Encoding"] = "gzip" + if encrypted: + headers["Mex-Content-Encrypted"] = "Y" + if message_subject is not None: + headers["Mex-Subject"] = message_subject + if len(message) > 80000000: + headers["Mex-Chunk-Range"] = f"1:{ceil(len(message)/80000000)}" + + if len(message) > 80000000: + resp = r.post( + url=f"{self.root_url}/messageexchange/{self.mailbox}/outbox", + data=message[0:80000000], + headers=headers, + cert=(self.cert_loc, self.key_loc), + verify=self.base_ca_loc, + ) + if resp.status_code == 403: + raise MESHAuthenticationError(response=resp) + if resp.status_code == 417: + raise MESHInvalidRecipient(response=resp) + if resp.status_code != 202: + raise MESHUnknownError(response=resp) + message_id = resp.json()["messageID"] + for chunk in range(2, ceil(len(message) / 80000000) + 1): + self._send_message_chunk( + message_id=message_id, + message_chunk=message[(chunk - 1) * 80000000:chunk * 80000000], + chunk_no=chunk, + chunk_range=ceil(len(message) / 80000000), + 
compressed=compress_message, + ) + else: + resp = r.post( + url=f"{self.root_url}/messageexchange/{self.mailbox}/outbox", + data=message, + headers=headers, + cert=(self.cert_loc, self.key_loc), + verify=self.base_ca_loc, + ) + if resp.status_code == 403: + raise MESHAuthenticationError(response=resp) + if resp.status_code == 417: + raise MESHInvalidRecipient(response=resp) + if resp.status_code != 202: + raise MESHUnknownError(response=resp) + + return resp.json() + + def _send_message_chunk( + self, + message_id: str, + message_chunk: bytes, + chunk_no: int, + chunk_range: int, + compressed: bool = True, + ) -> None: + """ + Send a message chunk to the MESH API. + This is expected to only be called by the send_message method. + + Parameters + ---------- + message_id : string + The internal MESH ID of the message to upload a chunk for + message_chunk : bytes + The data to send in this chunk + chunk_no : integer + The number of the chunk to upload + chunk_range : integer + How many chunks there are to upload in total + compressed : boolean, default = True + Is the message compressed? + + Raises + ---------- + MESHAuthenticationError + There was an authentication error accessing this page. Either the SSL certificate used is invalid, + or the client provided the wrong Mailbox ID, Password, or Shared Key. 
+ MESHUnknownError + There was an unexpected return status from the MESH API + + Examples + ---------- + >>> client._send_message_chunk("20200211115754892283_BC7B68", b'test', 2) #doctest: +SKIP + """ + headers = { + "Authorization": generate_authorization( + self.mailbox, self.password, self.api_shared_key + ), + "Mex-From": self.mailbox, + "Content-Type": "application/octet-stream", + "Mex-Chunk-Range": f"{chunk_no}:{chunk_range}", + } + if compressed: + headers["Content-Encoding"] = "gzip" + resp = r.post( + url=f"{self.root_url}/messageexchange/{self.mailbox}/outbox/{message_id}/{chunk_no}", + data=message_chunk, + headers=headers, + cert=(self.cert_loc, self.key_loc), + verify=self.base_ca_loc, + ) + if resp.status_code == 403: + raise MESHAuthenticationError(response=resp) + if resp.status_code != 202: + raise MESHUnknownError(response=resp) + + def check_message_status(self, message_id: str) -> dict: + """ + Check status of a sent message. + + Parameters + ---------- + message_id : string + The local message ID, eg. as provided to send_message. Does NOT work with MESH Message IDs, only + the local ID optionally provided on sending the message. + + Returns + ---------- + dict + The full response from the MESH API for this local ID. For details, consult the MESH API documentation + + Raises + ---------- + MESHAuthenticationError + There was an authentication error accessing this page. Either the SSL certificate used is invalid, + or the client provided the wrong Mailbox ID, Password, or Shared Key. 
+ MESHMultipleMatches + There are multiple messages in the outbox with this local ID + MESHUnknownError + There was an unexpected return status from the MESH API + + Examples + ---------- + >>> client.check_message_status(test) #doctest: +SKIP + {"statusSuccess": ...} + """ + resp = r.get( + url=f"{self.root_url}/messageexchange/{self.mailbox}/outbox/tracking/{message_id}", + headers={ + "Authorization": generate_authorization( + self.mailbox, self.password, self.api_shared_key + ) + }, + cert=(self.cert_loc, self.key_loc), + verify=self.base_ca_loc, + ) + if resp.status_code == 403: + raise MESHAuthenticationError(response=resp) + # There is an error in the API itself - in case of multiple match + # will send an error page with status 200 instead of 300 + if (resp.status_code == 300) or ( + resp.text + == "300: Multiple Choices300: Multiple Choices" + ): + raise MESHMultipleMatches(response=resp) + if resp.status_code == 404: + raise MESHMessageMissing(response=resp) + if resp.status_code != 200: + raise MESHUnknownError(response=resp) + return resp.json() + + def check_inbox(self) -> list: + """ + Determine the MESH IDs of the contents of the inbox. + This will return at most 500 entries, owing to the limitations of the API. + + Returns + ---------- + list + The MESH IDs of the messages in the inbox (str) + + Raises + ---------- + MESHAuthenticationError + There was an authentication error accessing this page. Either the SSL certificate used is invalid, + or the client provided the wrong Mailbox ID, Password, or Shared Key. 
+ MESHUnknownError + There was an unexpected return status from the MESH API + + Examples + ---------- + >>> client.check_inbox() #doctest: +SKIP + ["20200211115754892283_BC7B68", "20200211115928515346_9359E2"] + """ + resp = r.get( + url=f"{self.root_url}/messageexchange/{self.mailbox}/inbox", + headers={ + "Authorization": generate_authorization( + self.mailbox, self.password, self.api_shared_key + ) + }, + cert=(self.cert_loc, self.key_loc), + verify=self.base_ca_loc, + ) + if resp.status_code == 403: + raise MESHAuthenticationError(response=resp) + if resp.status_code != 200: + raise MESHUnknownError(response=resp) + return resp.json()["messages"] + + def check_inbox_count(self) -> int: + """ + Determine how many messages are in the MESH mailbox to download. + + Returns + ---------- + int + The number of messages ready to download + + Raises + ---------- + MESHAuthenticationError + There was an authentication error accessing this page. Either the SSL certificate used is invalid, + or the client provided the wrong Mailbox ID, Password, or Shared Key. + MESHUnknownError + There was an unexpected return status from the MESH API + + Examples + ---------- + >>> client.check_inbox_count() #doctest: +SKIP + 2 + """ + resp = r.get( + url=f"{self.root_url}/messageexchange/{self.mailbox}/count", + headers={ + "Authorization": generate_authorization( + self.mailbox, self.password, self.api_shared_key + ) + }, + cert=(self.cert_loc, self.key_loc), + verify=self.base_ca_loc, + ) + if resp.status_code == 403: + raise MESHAuthenticationError(response=resp) + if resp.status_code != 200: + raise MESHUnknownError(response=resp) + return resp.json()["count"] + + def check_and_download( + self, save_folder: str = None, recursive: bool = True + ) -> Union[Generator[dict, None, None], None]: + """ + Download all messages in the inbox. 
+ This will automatically handle reconstructing chunked messages, and automatically decompress any messages + which have Content-Encoding value of gzip. + WARNING: each downloaded message will be fully reconstructed and decompressed if needed. This may cause + issue for machines with very limited memory if there are very large files to download. + + If save_folder is provided, then downloaded files will be saved into that folder with their original filenames + (and non-delivery receipts will be saved there). This may cause issue if there are multiple files with the + same filename. + + If no save_folder is provided, then this function will return a generator which will yield each message in turn. + When the generator yields a message, it will send an acknowledgement to the MESH API for the previous + message; it is important that processing of the messages be complete and any required final outputs saved + before this - once acknowledged a message cannot be downloaded from MESH again. + + Parameters + ---------- + save_folder : string, default = None + If provided, the folder to save all downloaded files to when this function is called. The function + will not yield intermediate results. + + * For data files, the file will be saved in this folder with its original filename. + * For non-delivery reports, there will be a file created in the folder with filename + 'Non delivery report: (MESH message ID of failed delivery).txt', and with + content 'Message not delivered. All known details below' followed by the full + dictionary of headers from the download response. + + If not provided, then this function will instead yield results as documented below. + recursive : boolean, default = True + If true, then this method will be called recursively so long as there are more than 500 messages + in the inbox, the maximum number of messages the MESH API will provide IDs for at once. 
If false, + then only one call will be made to retrieve inbox contents, and at most 500 messages will be downloaded. + + Yields + ---------- + dict + Dictionary of details about the downloaded file. + + * filename (str): Filename of the original file (if provided). + * contents (bytes): Contents of the file (reconstructed and decompressed if necessary). + * headers (dict): Dictionary of headers returned by MESH on the initial download request. + For full details see the MESH API documentation. + * datafile (boolean): Indicates if this was a data file or a non-delivery report. + + Raises + ---------- + MESHAuthenticationError + There was an authentication error accessing the inbox. Either the SSL certificate used is invalid, + or the client provided the wrong Mailbox ID, Password, or Shared Key. + MESHUnknownError + There was an unexpected return status from the MESH API when accessing the inbox + MESHDownloadErrors + There were errors during the download process. This exception has the attribute 'exceptions', + which contains a full list of messages which generated exceptions, along with the exception. + This is only raised after completion of all non-error downloads, and downloads which raise + an exception are not acknowledged to the MESH API. 
    def _check_download_generator(self, recursive: bool) -> Generator[dict, None, None]:
        """Internal only - generator to return for check_and_download.

        Yields each downloaded message dict in turn. A message is only
        acknowledged to MESH once the consumer resumes the generator, i.e.
        after processing of the yielded message has finished.
        """
        message_ids = self.check_inbox()
        exceptions = []
        if recursive:
            # check_inbox returns at most 500 IDs; more than 500 waiting
            # means another pass is needed after this batch is acknowledged.
            repeat_needed = self.check_inbox_count() > 500
        for message_id in message_ids:
            try:
                # The yield sits inside the try so an exception thrown into
                # the generator is also recorded; the ack in the else clause
                # only runs after the consumer resumes without error.
                yield self.download_message(message_id, save_folder=None)
            except Exception as e:
                exceptions.append((message_id, e))
            else:
                self.ack_download_message(message_id)
            # Force termination if there are enough messages failing to download that they fill the inbox
            # Reduces risk of infinite loops
            if len(exceptions) >= 500:
                raise MESHDownloadErrors(exceptions)
        if recursive and repeat_needed:
            try:
                for msg in self._check_download_generator(recursive=True):
                    yield msg
            except MESHDownloadErrors as e:
                # Merge failures raised by the recursive pass with this pass's.
                exceptions.extend(e.exceptions)
        if exceptions:
            raise MESHDownloadErrors(exceptions)
self.ack_download_message(message_id) + # Force termination if there are enough messages failing to download that they fill the inbox + # Reduces risk of infinite loops + if len(exceptions) >= 500: + raise MESHDownloadErrors(exceptions) + if recursive and repeat_needed: + try: + self._check_download_save(save_folder, recursive=True) + except MESHDownloadErrors as e: + exceptions.extend(e.exceptions) + if exceptions: + raise MESHDownloadErrors(exceptions) + + def download_message(self, message_id: str, save_folder: str = None) -> dict: + """ + Request a message from the MESH API. + This will automatically handle reconstructing chunked messages, and automatically decompress any messages + which have Content-Encoding value of gzip. + WARNING: the full, reconstructed message will be held in memory, including after decompression. This may + cause problems, if you are using the API to download very large files on a machine with very limited memory. + + Parameters + ---------- + message_id : string + The internal MESH ID of the message to download + save_folder : string, default = None + Optional, the folder to save the downloaded message to. If not provided, then the files are not saved. + + * For data files, the file will be saved in this folder with its original filename. + * For non-delivery reports, there will be a file created in the folder with filename + 'Non delivery report: (MESH message ID of failed delivery).txt', and with + content 'Message not delivered. All known details below' followed by the full + dictionary of headers from the download response. + + Returns + ---------- + dict + Dictionary of details about the downloaded file. + + * filename (str): Filename of the original file (if provided). + * contents (bytes): Contents of the file (reconstructed and decompressed if necessary). + * headers (dict): Dictionary of headers returned by MESH on the initial download request. + For full details see the MESH API documentation. 
+ * datafile (boolean): Indicates if this was a data file or a non-delivery report. + + Raises + ---------- + MESHAuthenticationError + There was an authentication error accessing this page. Either the SSL certificate used is invalid, + or the client provided the wrong Mailbox ID, Password, or Shared Key. + MESHMessageMissing + There is no message with the provided message ID in the mailbox + MESHMessageAlreadyDownloaded + The message with the provided message ID has already been downloaded and acknowledged + MESHUnknownError + There was an unexpected return status from the MESH API + + Examples + ---------- + >>> client.download_message("20200211115754892283_BC7B68", "C:/Test Folder/") #doctest: +SKIP + {'filename': 'test.txt', 'contents': b'test_message', 'headers': {'Mex-Filename': 'test.txt', ...}, data: True} + >>> client.download_message("20200211115754892283_BC7B69") #doctest: +SKIP + {'filename': None, 'contents': b'', 'headers': {'Mex-Linkedmsgid': '1234567890', ...}, data: False} + """ + resp = r.get( + url=f"{self.root_url}/messageexchange/{self.mailbox}/inbox/{message_id}", + headers={ + "Authorization": generate_authorization( + self.mailbox, self.password, self.api_shared_key + ), + "Accept-Encoding": "gzip", + }, + cert=(self.cert_loc, self.key_loc), + verify=self.base_ca_loc, + stream=True, + ) + if resp.status_code == 403: + raise MESHAuthenticationError(response=resp) + elif resp.status_code == 404: + raise MESHMessageMissing(response=resp) + elif resp.status_code == 410: + raise MESHMessageAlreadyDownloaded(response=resp) + elif resp.status_code == 206: + core_data = resp.raw.data + chunk_count = int(resp.headers["Mex-Chunk-Range"][2:]) + for chunk in range(2, chunk_count + 1): + core_data += self._download_message_chunk(message_id, chunk) + elif resp.status_code == 200: + core_data = resp.raw.data + else: + raise MESHUnknownError(response=resp) + + # If this header exists, the message is a non delivery report + if ("Mex-Linkedmsgid" in 
resp.headers) or ( + resp.headers["Mex-MessageType"] == "REPORT" + ): + logging.info( + f"Non delivery report for message {resp.headers['Mex-Linkedmsgid']}" + ) + if save_folder is not None: + with open( + path.join( + save_folder, + f"Non delivery report: {resp.headers['Mex-Linkedmsgid']}.txt", + ), + "w", + ) as file: + file.write( + "Message not delivered. All known details below\n" + + str(resp.headers) + ) + return { + "filename": resp.headers.get("Mex-Filename"), + "contents": resp.content, + "headers": resp.headers, + "datafile": False, + } + + if ("Content-Encoding" in resp.headers) and ( + resp.headers["Content-Encoding"] == "gzip" + ): + core_data = decompress(core_data) + + if save_folder is not None: + with open( + path.join(save_folder, resp.headers["Mex-Filename"]), "wb" + ) as file: + file.write(core_data) + return { + "filename": resp.headers["Mex-Filename"], + "contents": core_data, + "headers": resp.headers, + "datafile": True, + } + + def _download_message_chunk(self, message_id: str, chunk_no: int) -> bytes: + """ + Request a message chunk from the MESH API. + This is expected to only be called by the download_message method. + + Parameters + ---------- + message_id : string + The internal MESH ID of the message to download a chunk from + chunk_no : integer + The number of the chunk to download + + Returns + ---------- + bytes + The raw content of the downloaded chunk + + Raises + ---------- + MESHAuthenticationError + There was an authentication error accessing this page. Either the SSL certificate used is invalid, + or the client provided the wrong Mailbox ID, Password, or Shared Key. 
+ MESHMessageMissing + There is no message with the provided message ID in the mailbox + MESHMessageAlreadyDownloaded + The message with the provided message ID has already been downloaded and acknowledged + MESHUnknownError + There was an unexpected return status from the MESH API + + Examples + ---------- + >>> client._download_message_chunk("20200211115754892283_BC7B68", 1) #doctest: +SKIP + b'test_message' + """ + resp = r.get( + url=f"{self.root_url}/messageexchange/{self.mailbox}/inbox/{message_id}/{chunk_no}", + headers={ + "Authorization": generate_authorization( + self.mailbox, self.password, self.api_shared_key + ), + "Accept-Encoding": "gzip", + }, + cert=(self.cert_loc, self.key_loc), + verify=self.base_ca_loc, + stream=True, + ) + if resp.status_code == 403: + raise MESHAuthenticationError(response=resp) + elif resp.status_code == 404: + raise MESHMessageMissing(response=resp) + elif resp.status_code == 410: + raise MESHMessageAlreadyDownloaded(response=resp) + elif resp.status_code in (200, 206): + return resp.raw.data + else: + raise MESHUnknownError(response=resp) + + def ack_download_message(self, message_id: str) -> None: + """ + Send acknowledgement to the MESH API that a message has finished downloading. + This should only be done after the message has successfully been saved - + once sent, the message is remvoed from the MESH server. + Per the API, this must be sent once a message has been successfully processed. + + Parameters + ---------- + message_id : string + The internal MESH ID of the downloaded message + + Raises + ---------- + MESHAuthenticationError + There was an authentication error accessing this page. Either the SSL certificate used is invalid, + or the client provided the wrong Mailbox ID, Password, or Shared Key. 
def generate_authorization(mailbox: str, password: str, api_shared_key: str) -> str:
    """
    Generate an authorization string as specified by the MESH API documentation v1.14

    The token has the form ``NHSMESH mailbox:nonce:1:timestamp:hash`` where the
    hash is an HMAC-SHA256 over ``mailbox:nonce:1:password:timestamp`` keyed with
    the environment's shared API key.

    Parameters
    ----------
    mailbox : string
        The mailbox ID to generate authorization for
    password : string
        The password for the mailbox
    api_shared_key : string
        The shared API key for the MESH environment the request is being made to

    Returns
    ----------
    string
        The generated authentication string

    Examples
    ----------
    >>> generate_authorization("TEST_BOX", "TEST_PW", "TEST_KEY") #doctest: +SKIP
    "NHSMESH TEST_BOX:ccd54b96-ee41-4d34-9700-7f9ec63d0720:1:202002120857:763 ... 872c"
    """
    nonce = str(uuid4())
    timestamp = datetime.now().strftime("%Y%m%d%H%M")
    # Nonce count is fixed at 1: a fresh nonce is generated for every token.
    hmac_payload = ":".join([mailbox, nonce, "1", password, timestamp])
    digest = hmac(
        api_shared_key.encode(),
        msg=hmac_payload.encode("utf8"),
        digestmod="sha256",
    ).hexdigest()
    return ":".join([f"NHSMESH {mailbox}", nonce, "1", timestamp, digest])
def mock_download(message_id, save_folder=None):
    """Stand-in for download_message: echoes the message ID back as content."""
    result = {
        "filename": message_id,
        "contents": message_id,
        "headers": {},
        "datafile": True,
    }
    if save_folder is not None:
        # Mirror the real client's side effect of writing the file to disk.
        with open(path.join(save_folder, str(message_id)), "w") as file:
            file.write(str(message_id))
    return result


def mock_download_fail_auth(message_id, save_folder=None):
    """Stand-in download that always fails authentication."""
    raise mesh.MESHAuthenticationError


def mock_download_fail_gone(message_id, save_folder=None):
    """Stand-in download for a message that was already downloaded."""
    raise mesh.MESHMessageAlreadyDownloaded


def mock_download_chooser_factory(auth_ids, gone_ids):
    """Build a download stand-in that fails for the selected message IDs."""

    def mock(message_id, save_folder=None):
        if message_id in auth_ids:
            return mock_download_fail_auth(message_id, save_folder)
        if message_id in gone_ids:
            return mock_download_fail_gone(message_id, save_folder)
        return mock_download(message_id, save_folder)

    return mock


def mock_inbox_factory(outputs_list):
    """Build a check_inbox stand-in replaying outputs_list, then [] forever."""
    replay = iter(outputs_list)

    def mock_output(*args, **kwargs):
        return next(replay, [])

    return mock_output


def mock_count_factory(counts_list):
    """Build a check_inbox_count stand-in replaying counts_list, then 0 forever."""
    replay = iter(counts_list)

    def mock_output(*args, **kwargs):
        return next(replay, 0)

    return mock_output
__init__(self): + self.count = 0 + self.data = [] + + def inc(self, *args, **kwargs): + self.count += 1 + self.data.append((args, kwargs)) + + +@pytest.fixture +def track_ack(monkeypatch, mesh_connection): + tracker = Tracker() + monkeypatch.setattr(mesh_connection, "ack_download_message", tracker.inc) + return tracker + + +@pytest.fixture +def patch_valid(monkeypatch, mesh_connection): + monkeypatch.setattr(mesh_connection, "download_message", mock_download) + monkeypatch.setattr(mesh_connection, "check_inbox_count", mock_count_factory([3])) + monkeypatch.setattr( + mesh_connection, "check_inbox", mock_inbox_factory([["1", "2", "3"]]) + ) + return mesh_connection + + +def test_CheckDownload_DownloadsCorrectSave(patch_valid, track_ack, tmpdir): + p = tmpdir.mkdir("dl") + out = patch_valid.check_and_download(save_folder=str(p), recursive=False) + assert out is None + assert p.join("1").read() == "1" + assert p.join("2").read() == "2" + assert p.join("3").read() == "3" + assert track_ack.data == [(("1",), {}), (("2",), {}), (("3",), {})] + + +def test_CheckDownload_DownloadsCorrectGenerator(patch_valid, track_ack): + out = patch_valid.check_and_download(save_folder=None, recursive=False) + msg = next(out) + assert msg == {"filename": "1", "contents": "1", "headers": {}, "datafile": True} + msg = next(out) + assert msg == {"filename": "2", "contents": "2", "headers": {}, "datafile": True} + assert track_ack.data == [(("1",), {})] + msg = next(out) + assert msg == {"filename": "3", "contents": "3", "headers": {}, "datafile": True} + assert track_ack.data == [(("1",), {}), (("2",), {})] + with pytest.raises(StopIteration): + msg = next(out) + assert track_ack.data == [(("1",), {}), (("2",), {}), (("3",), {})] + + +@pytest.fixture +def patch_recurse(monkeypatch, mesh_connection): + monkeypatch.setattr(mesh_connection, "download_message", mock_download) + monkeypatch.setattr( + mesh_connection, "check_inbox_count", mock_count_factory([501, 501, 1]) + ) + 
monkeypatch.setattr( + mesh_connection, + "check_inbox", + mock_inbox_factory([["1", "2", "3"], ["4"], ["5"]]), + ) + return mesh_connection + + +def test_CheckDownload_NoRecurseSave(patch_recurse, track_ack, tmpdir): + p = tmpdir.mkdir("dl") + out = patch_recurse.check_and_download(save_folder=str(p), recursive=False) + assert out is None + assert p.join("1").read() == "1" + assert p.join("2").read() == "2" + assert p.join("3").read() == "3" + assert p.join("4").exists() is False + assert p.join("5").exists() is False + assert track_ack.data == [(("1",), {}), (("2",), {}), (("3",), {})] + + +def test_CheckDownload_RecurseSave(patch_recurse, track_ack, tmpdir): + p = tmpdir.mkdir("dl") + out = patch_recurse.check_and_download(save_folder=str(p), recursive=True) + assert out is None + assert p.join("1").read() == "1" + assert p.join("2").read() == "2" + assert p.join("3").read() == "3" + assert p.join("4").read() == "4" + assert p.join("5").read() == "5" + assert track_ack.data == [ + (("1",), {}), + (("2",), {}), + (("3",), {}), + (("4",), {}), + (("5",), {}), + ] + + +def test_CheckDownload_NoRecurseGen(patch_recurse, track_ack): + out = patch_recurse.check_and_download(save_folder=None, recursive=False) + msg = next(out) + assert msg == {"filename": "1", "contents": "1", "headers": {}, "datafile": True} + msg = next(out) + assert msg == {"filename": "2", "contents": "2", "headers": {}, "datafile": True} + assert track_ack.data == [(("1",), {})] + msg = next(out) + assert msg == {"filename": "3", "contents": "3", "headers": {}, "datafile": True} + assert track_ack.data == [(("1",), {}), (("2",), {})] + with pytest.raises(StopIteration): + msg = next(out) + assert track_ack.data == [(("1",), {}), (("2",), {}), (("3",), {})] + + +def test_CheckDownload_RecurseGen(patch_recurse, track_ack): + out = patch_recurse.check_and_download(save_folder=None, recursive=True) + msg = next(out) + assert msg == {"filename": "1", "contents": "1", "headers": {}, "datafile": True} + 
msg = next(out) + assert msg == {"filename": "2", "contents": "2", "headers": {}, "datafile": True} + assert track_ack.data == [(("1",), {})] + msg = next(out) + assert msg == {"filename": "3", "contents": "3", "headers": {}, "datafile": True} + assert track_ack.data == [(("1",), {}), (("2",), {})] + msg = next(out) + assert msg == {"filename": "4", "contents": "4", "headers": {}, "datafile": True} + assert track_ack.data == [(("1",), {}), (("2",), {}), (("3",), {})] + msg = next(out) + assert msg == {"filename": "5", "contents": "5", "headers": {}, "datafile": True} + assert track_ack.data == [(("1",), {}), (("2",), {}), (("3",), {}), (("4",), {})] + with pytest.raises(StopIteration): + msg = next(out) + assert track_ack.data == [ + (("1",), {}), + (("2",), {}), + (("3",), {}), + (("4",), {}), + (("5",), {}), + ] + + +@pytest.fixture +def patch_errors(monkeypatch, mesh_connection): + monkeypatch.setattr( + mesh_connection, + "download_message", + mock_download_chooser_factory(["1", "2", "6"], ["3", "4", "9"]), + ) + monkeypatch.setattr( + mesh_connection, "check_inbox_count", mock_count_factory([501, 501, 1]) + ) + monkeypatch.setattr( + mesh_connection, + "check_inbox", + mock_inbox_factory([["1", "2", "3", "4", "5"], ["6", "7"], ["8", "9"]]), + ) + return mesh_connection + + +def test_CheckDownload_ErrorsNoRecurseSave(patch_errors, track_ack, tmpdir): + p = tmpdir.mkdir("dl") + with pytest.raises(mesh.MESHDownloadErrors) as exc: + patch_errors.check_and_download(save_folder=str(p), recursive=False) + assert exc.value.exceptions[0][0] == "1" + assert type(exc.value.exceptions[0][1]) == mesh.MESHAuthenticationError + assert exc.value.exceptions[1][0] == "2" + assert type(exc.value.exceptions[1][1]) == mesh.MESHAuthenticationError + assert exc.value.exceptions[2][0] == "3" + assert type(exc.value.exceptions[2][1]) == mesh.MESHMessageAlreadyDownloaded + assert exc.value.exceptions[3][0] == "4" + assert type(exc.value.exceptions[3][1]) == 
mesh.MESHMessageAlreadyDownloaded + assert len(exc.value.exceptions) == 4 + + assert p.join("1").exists() is False + assert p.join("2").exists() is False + assert p.join("3").exists() is False + assert p.join("4").exists() is False + assert p.join("5").read() == "5" + assert track_ack.data == [(("5",), {})] + + +def test_CheckDownload_ErrorsNoRecurseGen(patch_errors, track_ack): + out = patch_errors.check_and_download(save_folder=None, recursive=False) + msg = next(out) + assert msg == {"filename": "5", "contents": "5", "headers": {}, "datafile": True} + with pytest.raises(mesh.MESHDownloadErrors) as exc: + msg = next(out) + assert exc.value.exceptions[0][0] == "1" + assert type(exc.value.exceptions[0][1]) == mesh.MESHAuthenticationError + assert exc.value.exceptions[1][0] == "2" + assert type(exc.value.exceptions[1][1]) == mesh.MESHAuthenticationError + assert exc.value.exceptions[2][0] == "3" + assert type(exc.value.exceptions[2][1]) == mesh.MESHMessageAlreadyDownloaded + assert exc.value.exceptions[3][0] == "4" + assert type(exc.value.exceptions[3][1]) == mesh.MESHMessageAlreadyDownloaded + assert len(exc.value.exceptions) == 4 + + assert track_ack.data == [(("5",), {})] + + +def test_CheckDownload_ErrorsRecurseSave(patch_errors, track_ack, tmpdir): + p = tmpdir.mkdir("dl") + with pytest.raises(mesh.MESHDownloadErrors) as exc: + patch_errors.check_and_download(save_folder=str(p), recursive=True) + assert exc.value.exceptions[0][0] == "1" + assert type(exc.value.exceptions[0][1]) == mesh.MESHAuthenticationError + assert exc.value.exceptions[1][0] == "2" + assert type(exc.value.exceptions[1][1]) == mesh.MESHAuthenticationError + assert exc.value.exceptions[2][0] == "3" + assert type(exc.value.exceptions[2][1]) == mesh.MESHMessageAlreadyDownloaded + assert exc.value.exceptions[3][0] == "4" + assert type(exc.value.exceptions[3][1]) == mesh.MESHMessageAlreadyDownloaded + assert exc.value.exceptions[4][0] == "6" + assert type(exc.value.exceptions[4][1]) == 
mesh.MESHAuthenticationError + assert exc.value.exceptions[5][0] == "9" + assert type(exc.value.exceptions[5][1]) == mesh.MESHMessageAlreadyDownloaded + assert len(exc.value.exceptions) == 6 + + assert p.join("1").exists() is False + assert p.join("2").exists() is False + assert p.join("3").exists() is False + assert p.join("4").exists() is False + assert p.join("5").read() == "5" + assert p.join("6").exists() is False + assert p.join("7").read() == "7" + assert p.join("8").read() == "8" + assert p.join("9").exists() is False + assert track_ack.data == [(("5",), {}), (("7",), {}), (("8",), {})] + + +def test_CheckDownload_ErrorsRecurseGen(patch_errors, track_ack): + out = patch_errors.check_and_download(save_folder=None, recursive=True) + msg = next(out) + assert msg == {"filename": "5", "contents": "5", "headers": {}, "datafile": True} + msg = next(out) + assert msg == {"filename": "7", "contents": "7", "headers": {}, "datafile": True} + assert track_ack.data == [(("5",), {})] + msg = next(out) + assert msg == {"filename": "8", "contents": "8", "headers": {}, "datafile": True} + assert track_ack.data == [(("5",), {}), (("7",), {})] + with pytest.raises(mesh.MESHDownloadErrors) as exc: + msg = next(out) + assert exc.value.exceptions[0][0] == "1" + assert type(exc.value.exceptions[0][1]) == mesh.MESHAuthenticationError + assert exc.value.exceptions[1][0] == "2" + assert type(exc.value.exceptions[1][1]) == mesh.MESHAuthenticationError + assert exc.value.exceptions[2][0] == "3" + assert type(exc.value.exceptions[2][1]) == mesh.MESHMessageAlreadyDownloaded + assert exc.value.exceptions[3][0] == "4" + assert type(exc.value.exceptions[3][1]) == mesh.MESHMessageAlreadyDownloaded + assert exc.value.exceptions[4][0] == "6" + assert type(exc.value.exceptions[4][1]) == mesh.MESHAuthenticationError + assert exc.value.exceptions[5][0] == "9" + assert type(exc.value.exceptions[5][1]) == mesh.MESHMessageAlreadyDownloaded + assert len(exc.value.exceptions) == 6 + + assert 
track_ack.data == [(("5",), {}), (("7",), {}), (("8",), {})] + + +@pytest.fixture +def patch_many_errors(monkeypatch, mesh_connection): + # This is for testing the early abort of the recursion if we hit 500 failed messages in one fetch + # Failed messages will not be acknowledged, and will thus stay in the MESH system + # If the issue is inherent to the message, and the inbox is full of messages with these issues + # then we could enter an infinite loop without this abort + monkeypatch.setattr( + mesh_connection, + "download_message", + mock_download_chooser_factory( + list(chain(range(300), range(500, 800))), list(range(1000, 1500)) + ), + ) + monkeypatch.setattr( + mesh_connection, "check_inbox_count", mock_count_factory([501, 501, 501, 1]) + ) + monkeypatch.setattr( + mesh_connection, + "check_inbox", + mock_inbox_factory( + [range(500), range(500, 1000), range(1000, 1500), range(1500, 1501)] + ), + ) + return mesh_connection + + +def test_CheckDownload_ErrorsEarlyTerminateSave(patch_many_errors, tmpdir, track_ack): + p = tmpdir.mkdir("dl") + with pytest.raises(mesh.MESHDownloadErrors) as exc: + patch_many_errors.check_and_download(save_folder=p, recursive=True) + assert len(exc.value.exceptions) == 1100 + for e, index in zip( + exc.value.exceptions, chain(range(300), range(500, 800), range(1000, 1500)) + ): + assert e[0] == index + assert len(p.listdir()) == 400 + + +def test_CheckDownload_ErrorsEarlyTerminateGen(patch_many_errors, track_ack): + with pytest.raises(mesh.MESHDownloadErrors) as exc: + for msg, index in zip( + patch_many_errors.check_and_download(save_folder=None, recursive=True), + chain(range(300, 500), range(800, 1000)), + ): + assert msg == { + "filename": index, + "contents": index, + "headers": {}, + "datafile": True, + } + assert len(exc.value.exceptions) == 1100 + for e, index in zip( + exc.value.exceptions, chain(range(300), range(500, 800), range(1000, 1500)) + ): + assert e[0] == index diff --git 
a/codonPython/mesh/tests/test_check_authentication.py b/codonPython/mesh/tests/test_check_authentication.py new file mode 100644 index 0000000..60dbbf3 --- /dev/null +++ b/codonPython/mesh/tests/test_check_authentication.py @@ -0,0 +1,51 @@ +import pytest + +import codonPython.mesh as mesh + + +def test_CheckAuthentication_ValidRequest_ReturnsTrue(requests_mock, mesh_connection): + requests_mock.post( + url="http://root/messageexchange/TestMailboxId", status_code=200, + ) + test_check_authentication = mesh_connection.check_authentication() + assert requests_mock.call_count == 1 + assert test_check_authentication + + +def test_CheckAuthentication_HasRequiredHeaders(requests_mock, mesh_connection): + requests_mock.post( + url="http://root/messageexchange/TestMailboxId", + request_headers={"Authorization": "xxxauthorizationxxx"}, + status_code=200, + ) + mesh_connection.check_authentication() + assert requests_mock.call_count == 1 + assert all( + header in requests_mock.request_history[0].headers + for header in [ + "Mex-ClientVersion", + "Mex-OSArchitecture", + "Mex-OSName", + "Mex-OSVersion", + ] + ) + + +def test_CheckAuthentication_403StatusCode_ReturnsFalse(requests_mock, mesh_connection): + requests_mock.post( + url="http://root/messageexchange/TestMailboxId", status_code=403, + ) + test_check_authentication = mesh_connection.check_authentication() + assert requests_mock.call_count == 1 + assert not test_check_authentication + + +def test_CheckAuthentication_400StatusCode_ReturnsUnknownError( + requests_mock, mesh_connection +): + requests_mock.post( + url="http://root/messageexchange/TestMailboxId", status_code=400, + ) + with pytest.raises(mesh.MESHUnknownError): + mesh_connection.check_authentication() + assert requests_mock.call_count == 1 diff --git a/codonPython/mesh/tests/test_check_inbox.py b/codonPython/mesh/tests/test_check_inbox.py new file mode 100644 index 0000000..ba1ba0c --- /dev/null +++ b/codonPython/mesh/tests/test_check_inbox.py @@ -0,0 +1,39 
@@ +import pytest + +import codonPython.mesh as mesh + + +def test_CheckInbox_ValidRequest_ReturnsJson(requests_mock, mesh_connection): + requests_mock.get( + url="http://root/messageexchange/TestMailboxId/inbox", + request_headers={"Authorization": "xxxauthorizationxxx"}, + status_code=200, + json={"messages": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, + ) + test_check_inbox_count = mesh_connection.check_inbox() + assert requests_mock.call_count == 1 + assert test_check_inbox_count == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + +def test_CheckInbox_403StatusCode_ReturnsAuthenticationError( + requests_mock, mesh_connection +): + requests_mock.get( + url="http://root/messageexchange/TestMailboxId/inbox", + request_headers={"Authorization": "xxxauthorizationxxx"}, + status_code=403, + ) + with pytest.raises(mesh.MESHAuthenticationError): + mesh_connection.check_inbox() + assert requests_mock.call_count == 1 + + +def test_CheckInbox_400StatusCode_ReturnsUnknownError(requests_mock, mesh_connection): + requests_mock.get( + url="http://root/messageexchange/TestMailboxId/inbox", + request_headers={"Authorization": "xxxauthorizationxxx"}, + status_code=400, + ) + with pytest.raises(mesh.MESHUnknownError): + mesh_connection.check_inbox() + assert requests_mock.call_count == 1 diff --git a/codonPython/mesh/tests/test_check_inbox_count.py b/codonPython/mesh/tests/test_check_inbox_count.py new file mode 100644 index 0000000..bb5eaa6 --- /dev/null +++ b/codonPython/mesh/tests/test_check_inbox_count.py @@ -0,0 +1,41 @@ +import pytest + +import codonPython.mesh as mesh + + +def test_CheckInboxCount_ValidRequest_ReturnsJson(requests_mock, mesh_connection): + requests_mock.get( + url="http://root/messageexchange/TestMailboxId/count", + request_headers={"Authorization": "xxxauthorizationxxx"}, + status_code=200, + json={"count": 100}, + ) + test_check_inbox_count = mesh_connection.check_inbox_count() + assert test_check_inbox_count == 100 + assert requests_mock.call_count == 1 + + +def 
test_CheckInboxCount_403StatusCode_ReturnsAuthenticationError( + requests_mock, mesh_connection +): + requests_mock.get( + url="http://root/messageexchange/TestMailboxId/count", + request_headers={"Authorization": "xxxauthorizationxxx"}, + status_code=403, + ) + with pytest.raises(mesh.MESHAuthenticationError): + mesh_connection.check_inbox_count() + assert requests_mock.call_count == 1 + + +def test_CheckInboxCount_400StatusCode_RaisesUnknownError( + requests_mock, mesh_connection +): + requests_mock.get( + url="http://root/messageexchange/TestMailboxId/count", + request_headers={"Authorization": "xxxauthorizationxxx"}, + status_code=400, + ) + with pytest.raises(mesh.MESHUnknownError): + mesh_connection.check_inbox_count() + assert requests_mock.call_count == 1 diff --git a/codonPython/mesh/tests/test_check_message_status.py b/codonPython/mesh/tests/test_check_message_status.py new file mode 100644 index 0000000..001cfe7 --- /dev/null +++ b/codonPython/mesh/tests/test_check_message_status.py @@ -0,0 +1,105 @@ +import pytest +import codonPython.mesh as mesh + + +@pytest.fixture +def base_params(): + return { + "message_id": "1", + } + + +@pytest.fixture +def base_headers(): + return { + "Authorization": "xxxauthorizationxxx", + } + + +def test_CheckMessage_403_RaisesAuthenticationError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/outbox/tracking/{base_params['message_id']}", + request_headers=base_headers, + status_code=403, + ) + with pytest.raises(mesh.MESHAuthenticationError): + mesh_connection.check_message_status(**base_params) + + +def test_CheckMessage_404_RaisesMissingError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/outbox/tracking/{base_params['message_id']}", + request_headers=base_headers, + status_code=404, + ) + with pytest.raises(mesh.MESHMessageMissing): + 
mesh_connection.check_message_status(**base_params) + + +def test_CheckMessage_400_RaisesUnknownError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/outbox/tracking/{base_params['message_id']}", + request_headers=base_headers, + status_code=400, + ) + with pytest.raises(mesh.MESHUnknownError): + mesh_connection.check_message_status(**base_params) + + +def test_CheckMessage_300_RaisesMultipleError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/outbox/tracking/{base_params['message_id']}", + request_headers=base_headers, + status_code=300, + ) + with pytest.raises(mesh.MESHMultipleMatches): + mesh_connection.check_message_status(**base_params) + + +# Due to errors in the API, test for a 300 error sent with code 200 +def test_CheckMessage_Fake300_RaisesMultipleError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/outbox/tracking/{base_params['message_id']}", + request_headers=base_headers, + status_code=200, + text="300: Multiple Choices300: Multiple Choices", + ) + with pytest.raises(mesh.MESHMultipleMatches): + mesh_connection.check_message_status(**base_params) + + +def test_CheckMessage_Valid_RequestsOnce( + mesh_connection, requests_mock, base_params, base_headers +): + resp = {"test": "true"} + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/outbox/tracking/{base_params['message_id']}", + request_headers=base_headers, + status_code=200, + json=resp, + ) + mesh_connection.check_message_status(**base_params) + assert requests_mock.call_count == 1 + + +def test_CheckMessage_Valid_ReturnsJSON( + mesh_connection, requests_mock, base_params, base_headers +): + resp = {"test": "true"} + requests_mock.get( + 
url=f"http://root/messageexchange/TestMailboxId/outbox/tracking/{base_params['message_id']}", + request_headers=base_headers, + status_code=200, + json=resp, + ) + assert mesh_connection.check_message_status(**base_params) == resp diff --git a/codonPython/mesh/tests/test_download_message.py b/codonPython/mesh/tests/test_download_message.py new file mode 100644 index 0000000..2fbed55 --- /dev/null +++ b/codonPython/mesh/tests/test_download_message.py @@ -0,0 +1,284 @@ +import pytest + +import codonPython.mesh as mesh + + +@pytest.fixture +def base_params(): + return { + "message_id": "1", + } + + +def test_DownloadMessage_SimpleFileReturnsCorrect( + requests_mock, mesh_connection, base_params, tmpdir +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}", + request_headers={ + "Authorization": "xxxauthorizationxxx", + "Accept-Encoding": "gzip", + }, + status_code=200, + headers={"Mex-FileName": "test.txt", "Mex-MessageType": "DATA"}, + text="test", + ) + assert mesh_connection.download_message(**base_params) == { + "filename": "test.txt", + "contents": b"test", + "headers": {"Mex-FileName": "test.txt", "Mex-MessageType": "DATA"}, + "datafile": True, + } + p = tmpdir.mkdir("save") + base_params["save_folder"] = str(p) + assert mesh_connection.download_message(**base_params) == { + "filename": "test.txt", + "contents": b"test", + "headers": {"Mex-FileName": "test.txt", "Mex-MessageType": "DATA"}, + "datafile": True, + } + assert p.join("test.txt").read() == "test" + + +def test_DownloadMessage_ZipFileReturnsCorrect( + requests_mock, mesh_connection, base_params, tmpdir +): + import gzip + + message = gzip.compress(b"test") + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}", + request_headers={ + "Authorization": "xxxauthorizationxxx", + "Accept-Encoding": "gzip", + }, + status_code=200, + headers={ + "Mex-FileName": "test.txt", + "Content-Encoding": "gzip", + 
"Mex-MessageType": "DATA", + }, + content=message, + ) + assert mesh_connection.download_message(**base_params) == { + "filename": "test.txt", + "contents": b"test", + "headers": { + "Mex-FileName": "test.txt", + "Content-Encoding": "gzip", + "Mex-MessageType": "DATA", + }, + "datafile": True, + } + p = tmpdir.mkdir("save") + base_params["save_folder"] = str(p) + assert mesh_connection.download_message(**base_params) == { + "filename": "test.txt", + "contents": b"test", + "headers": { + "Mex-FileName": "test.txt", + "Content-Encoding": "gzip", + "Mex-MessageType": "DATA", + }, + "datafile": True, + } + assert p.join("test.txt").read() == "test" + + +def test_DownloadMessage_NonDeliveryReturnsCorrect( + requests_mock, mesh_connection, base_params, tmpdir +): + headers = { + "Mex-Linkedmsgid": "1", + "Mex-MessageType": "REPORT", + } + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}", + request_headers={ + "Authorization": "xxxauthorizationxxx", + "Accept-Encoding": "gzip", + }, + status_code=200, + headers=headers, + ) + assert mesh_connection.download_message(**base_params) == { + "filename": None, + "contents": b'', + "headers": headers, + "datafile": False, + } + p = tmpdir.mkdir("save") + base_params["save_folder"] = str(p) + assert mesh_connection.download_message(**base_params) == { + "filename": None, + "contents": b'', + "headers": headers, + "datafile": False, + } + assert p.join( + "Non delivery report: 1.txt" + ).read() == "Message not delivered. 
All known details below\n" + str(headers) + + +def test_DownloadMessage_ChunkedFileReturnsCorrect( + requests_mock, mesh_connection, base_params, tmpdir +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}", + request_headers={ + "Authorization": "xxxauthorizationxxx", + "Accept-Encoding": "gzip", + }, + status_code=206, + headers={ + "Mex-FileName": "test.txt", + "Mex-MessageType": "DATA", + "Mex-Chunk-Range": "1:3", + }, + text="test-", + ) + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}/2", + status_code=206, + headers={"Mex-Chunk-Range": "2:3"}, + text="test2-", + ) + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}/3", + status_code=200, + headers={"Mex-Chunk-Range": "3:3"}, + text="test3", + ) + assert mesh_connection.download_message(**base_params) == { + "filename": "test.txt", + "contents": b"test-test2-test3", + "headers": { + "Mex-FileName": "test.txt", + "Mex-MessageType": "DATA", + "Mex-Chunk-Range": "1:3", + }, + "datafile": True, + } + p = tmpdir.mkdir("save") + base_params["save_folder"] = str(p) + assert mesh_connection.download_message(**base_params) == { + "filename": "test.txt", + "contents": b"test-test2-test3", + "headers": { + "Mex-FileName": "test.txt", + "Mex-MessageType": "DATA", + "Mex-Chunk-Range": "1:3", + }, + "datafile": True, + } + assert p.join("test.txt").read() == "test-test2-test3" + + +def test_DownloadMessage_ChunkedZipFileReturnsCorrect( + requests_mock, mesh_connection, base_params, tmpdir +): + import gzip + from math import floor + + message = gzip.compress(b"test-test2-test3") + split = floor(len(message) / 3) + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}", + status_code=206, + headers={ + "Mex-FileName": "test.txt", + "Mex-MessageType": "DATA", + "Mex-Chunk-Range": "1:3", + "Content-Encoding": 
"gzip", + }, + content=message[:split], + ) + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}/2", + status_code=206, + headers={"Mex-Chunk-Range": "2:3"}, + content=message[split:split * 2], + ) + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}/3", + status_code=200, + headers={"Mex-Chunk-Range": "3:3"}, + content=message[split * 2:], + ) + assert mesh_connection.download_message(**base_params) == { + "filename": "test.txt", + "contents": b"test-test2-test3", + "headers": { + "Mex-FileName": "test.txt", + "Mex-MessageType": "DATA", + "Mex-Chunk-Range": "1:3", + "Content-Encoding": "gzip", + }, + "datafile": True, + } + p = tmpdir.mkdir("save") + base_params["save_folder"] = str(p) + assert mesh_connection.download_message(**base_params) == { + "filename": "test.txt", + "contents": b"test-test2-test3", + "headers": { + "Mex-FileName": "test.txt", + "Mex-MessageType": "DATA", + "Mex-Chunk-Range": "1:3", + "Content-Encoding": "gzip", + }, + "datafile": True, + } + assert p.join("test.txt").read() == "test-test2-test3" + + +def test_DownloadMessage_403StatusCode_ReturnsAuthenticationError( + requests_mock, mesh_connection +): + requests_mock.get( + url="http://root/messageexchange/TestMailboxId/inbox/8", + request_headers={"Authorization": "xxxauthorizationxxx"}, + status_code=403, + ) + with pytest.raises(mesh.MESHAuthenticationError): + mesh_connection.download_message(message_id=8, save_folder="save_folder") + assert requests_mock.call_count == 1 + + +def test_DownloadMessage_404StatusCode_ReturnsMessageDoesNotExistError( + requests_mock, mesh_connection +): + requests_mock.get( + url="http://root/messageexchange/TestMailboxId/inbox/9", + request_headers={"Authorization": "xxxauthorizationxxx"}, + status_code=404, + ) + with pytest.raises(mesh.MESHMessageMissing): + mesh_connection.download_message(message_id=9, save_folder="save_folder") + assert 
requests_mock.call_count == 1 + + +def test_DownloadMessage_410StatusCode_ReturnsMessageAlreadyDownloadedError( + requests_mock, mesh_connection +): + requests_mock.get( + url="http://root/messageexchange/TestMailboxId/inbox/10", + request_headers={"Authorization": "xxxauthorizationxxx"}, + status_code=410, + ) + with pytest.raises(mesh.MESHMessageAlreadyDownloaded): + mesh_connection.download_message(message_id=10, save_folder="save_folder") + assert requests_mock.call_count == 1 + + +def test_DownloadMessage_400StatusCode_RaisesUnknownError( + requests_mock, mesh_connection +): + requests_mock.get( + url="http://root/messageexchange/TestMailboxId/inbox/10", + request_headers={"Authorization": "xxxauthorizationxxx"}, + status_code=400, + ) + with pytest.raises(mesh.MESHUnknownError): + mesh_connection.download_message(message_id=10, save_folder="save_folder") + assert requests_mock.call_count == 1 diff --git a/codonPython/mesh/tests/test_download_message_chunk.py b/codonPython/mesh/tests/test_download_message_chunk.py new file mode 100644 index 0000000..81d9739 --- /dev/null +++ b/codonPython/mesh/tests/test_download_message_chunk.py @@ -0,0 +1,101 @@ +import pytest +import codonPython.mesh as mesh + + +@pytest.fixture +def base_params(): + return { + "message_id": "1", + "chunk_no": 2, + } + + +@pytest.fixture +def base_headers(): + return {"Authorization": "xxxauthorizationxxx", "Accept-Encoding": "gzip"} + + +def test_DownloadMessageChunk_403_RaisesAuthenticationError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=403, + ) + with pytest.raises(mesh.MESHAuthenticationError): + mesh_connection._download_message_chunk(**base_params) + + +def test_DownloadMessageChunk_404_RaisesMissingError( + mesh_connection, requests_mock, base_params, base_headers +): + 
requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=404, + ) + with pytest.raises(mesh.MESHMessageMissing): + mesh_connection._download_message_chunk(**base_params) + + +def test_DownloadMessageChunk_410_RaisesGoneError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=410, + ) + with pytest.raises(mesh.MESHMessageAlreadyDownloaded): + mesh_connection._download_message_chunk(**base_params) + + +def test_DownloadMessageChunk_400_RaisesUnknownError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=400, + ) + with pytest.raises(mesh.MESHUnknownError): + mesh_connection._download_message_chunk(**base_params) + + +def test_DownloadMessageChunk_Valid_SentOnce( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=200, + text="test", + ) + mesh_connection._download_message_chunk(**base_params) + assert requests_mock.call_count == 1 + + +def test_DownloadMessageChunk_206_NoRaise( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=206, + text="test", + ) + mesh_connection._download_message_chunk(**base_params) + assert requests_mock.call_count == 1 + + +def 
test_DownloadMessageChunk_ReturnsCorrect( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.get( + url=f"http://root/messageexchange/TestMailboxId/inbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=200, + text="test", + ) + assert mesh_connection._download_message_chunk(**base_params) == b"test" diff --git a/codonPython/mesh/tests/test_generate_authorization.py b/codonPython/mesh/tests/test_generate_authorization.py new file mode 100644 index 0000000..a31d337 --- /dev/null +++ b/codonPython/mesh/tests/test_generate_authorization.py @@ -0,0 +1,48 @@ +import pytest +import re + +import codonPython.mesh as mesh + +mailbox = "(Test_Mailbox|)" +nonce = "[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12}" +time = "[0-9]{12}" +hash_out = "[0-9a-z]{64}" +auth_regex = re.compile(f"NHSMESH {mailbox}:{nonce}:1:{time}:{hash_out}") + + +class Test_generate_authorization: + def test_generate_authorization(self): + mailbox = "Test_Mailbox" + password = "Secret_Password" + api_shared_key = "api_shared_key" + test_generate_authorization = mesh.generate_authorization( + mailbox, password, api_shared_key + ) + assert re.match(auth_regex, test_generate_authorization,) + + def test_generate_authorization_with_blank_mailbox(self): + mailbox = "" + password = "Secret_Password" + api_shared_key = "api_shared_key" + test_generate_authorization = mesh.generate_authorization( + mailbox, password, api_shared_key + ) + assert re.match(auth_regex, test_generate_authorization,) + + def test_generate_authorization_with_blank_password(self): + mailbox = "Test_Mailbox" + password = "" + api_shared_key = "api_shared_key" + test_generate_authorization = mesh.generate_authorization( + mailbox, password, api_shared_key + ) + assert re.match(auth_regex, test_generate_authorization,) + + def test_generate_authorization_with_blank_api_key(self): + mailbox = "Test_Mailbox" + password = "Secret_Password" + 
api_shared_key = "" + test_generate_authorization = mesh.generate_authorization( + mailbox, password, api_shared_key + ) + assert re.match(auth_regex, test_generate_authorization,) diff --git a/codonPython/mesh/tests/test_send_file.py b/codonPython/mesh/tests/test_send_file.py new file mode 100644 index 0000000..04e8b15 --- /dev/null +++ b/codonPython/mesh/tests/test_send_file.py @@ -0,0 +1,43 @@ +import pytest + + +@pytest.fixture +def make_params(tmpdir): + p = tmpdir.mkdir("folder").join("test.txt") + p.write("test") + params = { + "dest_mailbox": "TESTMB", + "message_location": str(p), + "workflow_id": "TESTWF", + "message_subject": "TESTSUB", + "message_id": "TESTID", + "process_id": "TESTPROC", + "compress_message": True, + "encrypted": True, + } + return params + + +def track_args(**kwargs): + return kwargs + + +@pytest.fixture +def patch_message(mesh_connection, monkeypatch): + monkeypatch.setattr(mesh_connection, "send_message", track_args) + return mesh_connection + + +def test_SendFile_HandlesParams(patch_message, make_params): + params = patch_message.send_file(**make_params) + assert params == { + "dest_mailbox": "TESTMB", + "message": b"test", + "filename": "test.txt", + "workflow_id": "TESTWF", + "message_subject": "TESTSUB", + "message_id": "TESTID", + "process_id": "TESTPROC", + "compress_message": True, + "encrypted": True, + } diff --git a/codonPython/mesh/tests/test_send_message.py b/codonPython/mesh/tests/test_send_message.py new file mode 100644 index 0000000..c1c5528 --- /dev/null +++ b/codonPython/mesh/tests/test_send_message.py @@ -0,0 +1,285 @@ +import pytest +import codonPython.mesh as mesh + + +@pytest.fixture +def base_params(): + return { + "dest_mailbox": "TESTMB", + "message": b"TEST", + "filename": "TEST.txt", + "workflow_id": "TESTWF", + } + + +@pytest.fixture +def base_headers(): + return { + "Authorization": "xxxauthorizationxxx", + "Content-Type": "application/octet-stream", + "Mex-From": "TestMailboxId", + "Mex-To": "TESTMB", + 
"Mex-WorkflowId": "TESTWF", + "Mex-FileName": "TEST.txt", + "Mex-MessageType": "DATA", + "Mex-Version": "1.0", + } + + +def test_SendMessage_403_RaisesAuthenticationError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=403, + ) + with pytest.raises(mesh.MESHAuthenticationError): + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + + +def test_SendMessage_417_RaisesRecipientError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=417, + ) + with pytest.raises(mesh.MESHInvalidRecipient): + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + + +def test_SendMessage_400_RaisesUnknownError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=400, + ) + with pytest.raises(mesh.MESHUnknownError): + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + + +def test_SendMessage_ValidHash( + mesh_connection, requests_mock, base_params, base_headers +): + import hashlib + + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=202, + json={}, + ) + mesh_connection.send_message(**base_params) + checksum = hashlib.md5(base_params["message"]).hexdigest() + assert requests_mock.call_count == 1 + assert requests_mock.request_history[0].headers["Mex-Checksum"] == f"md5 {checksum}" + + +def test_SendMessage_AbsentOptional_Skipped( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + 
status_code=202, + json={}, + ) + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + assert not any( + header in requests_mock.request_history[0].headers + for header in [ + "Mex-ProcessID", + "Mex-LocalID", + "Mex-Subject", + "Mex-Content-Encrypted", + ] + ) + + +def test_SendMessage_PresentSubject_Included( + mesh_connection, requests_mock, base_params, base_headers +): + base_params["message_subject"] = "TESTSUB" + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=202, + json={}, + ) + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + assert requests_mock.request_history[0].headers["Mex-Subject"] == "TESTSUB" + + +def test_SendMessage_PresentMessageID_Included( + mesh_connection, requests_mock, base_params, base_headers +): + base_params["message_id"] = "TESTMSG" + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=202, + json={}, + ) + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + assert requests_mock.request_history[0].headers["Mex-LocalID"] == "TESTMSG" + + +def test_SendMessage_PresentProcess_Included( + mesh_connection, requests_mock, base_params, base_headers +): + base_params["process_id"] = "TESTPROC" + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=202, + json={}, + ) + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + assert requests_mock.request_history[0].headers["Mex-ProcessID"] == "TESTPROC" + + +def test_SendMessage_Encrypted_Included( + mesh_connection, requests_mock, base_params, base_headers +): + base_params["encrypted"] = True + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=202, + json={}, + ) + 
mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + assert "Mex-Content-Encrypted" in requests_mock.request_history[0].headers + + +def test_compress_if_set(mesh_connection, requests_mock, base_params, base_headers): + import gzip + + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=202, + json={}, + ) + expected_message = gzip.compress(base_params["message"]) + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + assert "Mex-Content-Compressed" in requests_mock.request_history[0].headers + assert requests_mock.request_history[0].headers["Content-Encoding"] == "gzip" + assert requests_mock.request_history[0].body == expected_message + + +def test_no_compress_if_not_set( + mesh_connection, requests_mock, base_params, base_headers +): + base_params["compress_message"] = False + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=202, + json={}, + ) + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + assert "Mex-Content-Compressed" not in requests_mock.request_history[0].headers + assert "Content-Encoding" not in requests_mock.request_history[0].headers + assert requests_mock.request_history[0].body == base_params["message"] + + +class Tracker: + def __init__(self): + self.count = 0 + self.data = [] + + def inc(self, **kwargs): + self.count += 1 + self.data.append(kwargs) + + +def test_chunk_massive_file( + mesh_connection, requests_mock, base_params, base_headers, monkeypatch +): + chunks_sent = Tracker() + monkeypatch.setattr(mesh_connection, "_send_message_chunk", chunks_sent.inc) + base_params["compress_message"] = False + base_params["message"] = ("x" * 200000000).encode() + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=202, + 
json={"messageID": "1"}, + ) + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + assert requests_mock.request_history[0].headers["Mex-Chunk-Range"] == "1:3" + assert requests_mock.request_history[0].body == base_params["message"][0:80000000] + assert chunks_sent.count == 2 + assert ( + chunks_sent.data[0]["message_chunk"] + == base_params["message"][80000000:160000000] + ) + assert ( + chunks_sent.data[1]["message_chunk"] + == base_params["message"][160000000:240000000] + ) + assert chunks_sent.data[0]["message_id"] == "1" + assert chunks_sent.data[1]["message_id"] == "1" + assert chunks_sent.data[0]["chunk_no"] == 2 + assert chunks_sent.data[1]["chunk_no"] == 3 + + +def test_SendMessage_403_RaisesAuthenticationError_MassiveFile( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=403, + ) + base_params["message"] = ("x" * 200000000).encode() + base_params["compress_message"] = False + with pytest.raises(mesh.MESHAuthenticationError): + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + + +def test_SendMessage_417_RaisesRecipientError_MassiveFile( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=417, + ) + base_params["message"] = ("x" * 200000000).encode() + base_params["compress_message"] = False + with pytest.raises(mesh.MESHInvalidRecipient): + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 + + +def test_SendMessage_400_RaisesUnknownError_MassiveFile( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.post( + url="http://root/messageexchange/TestMailboxId/outbox", + request_headers=base_headers, + status_code=400, + ) + base_params["message"] = ("x" * 
200000000).encode() + base_params["compress_message"] = False + with pytest.raises(mesh.MESHUnknownError): + mesh_connection.send_message(**base_params) + assert requests_mock.call_count == 1 diff --git a/codonPython/mesh/tests/test_send_message_chunk.py b/codonPython/mesh/tests/test_send_message_chunk.py new file mode 100644 index 0000000..b8a09ad --- /dev/null +++ b/codonPython/mesh/tests/test_send_message_chunk.py @@ -0,0 +1,83 @@ +import pytest +import codonPython.mesh as mesh + + +@pytest.fixture +def base_params(): + return { + "message_id": "1", + "message_chunk": b"TEST", + "chunk_no": 2, + "chunk_range": 3, + } + + +@pytest.fixture +def base_headers(): + return { + "Authorization": "xxxauthorizationxxx", + "Content-Type": "application/octet-stream", + "Mex-From": "TestMailboxId", + "Mex-Chunk-Range": "2:3", + } + + +def test_SendMessageChunk_403_RaisesAuthenticationError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.post( + url=f"http://root/messageexchange/TestMailboxId/outbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=403, + ) + with pytest.raises(mesh.MESHAuthenticationError): + mesh_connection._send_message_chunk(**base_params) + + +def test_SendMessageChunk_400_RaisesUnknownError( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.post( + url=f"http://root/messageexchange/TestMailboxId/outbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=400, + ) + with pytest.raises(mesh.MESHUnknownError): + mesh_connection._send_message_chunk(**base_params) + + +def test_SendMessageChunk_Valid_SentOnce( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.post( + url=f"http://root/messageexchange/TestMailboxId/outbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=202, + ) + 
mesh_connection._send_message_chunk(**base_params) + assert requests_mock.call_count == 1 + + +def test_SendMessageChunk_Compressed_CorrectHeaders( + mesh_connection, requests_mock, base_params, base_headers +): + requests_mock.post( + url=f"http://root/messageexchange/TestMailboxId/outbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=202, + ) + mesh_connection._send_message_chunk(**base_params) + assert requests_mock.request_history[0].headers["Content-Encoding"] == "gzip" + + +def test_SendMessageChunk_NotCompressed_CorrectHeaders( + mesh_connection, requests_mock, base_params, base_headers +): + base_params["compressed"] = False + requests_mock.post( + url=f"http://root/messageexchange/TestMailboxId/outbox/{base_params['message_id']}/{base_params['chunk_no']}", + request_headers=base_headers, + status_code=202, + ) + mesh_connection._send_message_chunk(**base_params) + assert "Content-Encoding" not in requests_mock.request_history[0].headers diff --git a/codonPython/nhsNumber.py b/codonPython/nhsNumber.py index 039eb80..d465a47 100644 --- a/codonPython/nhsNumber.py +++ b/codonPython/nhsNumber.py @@ -26,16 +26,13 @@ def nhsNumberValidator(number: int) -> bool: """ if not isinstance(number, int): - raise ValueError( - "Please input a positive 10 digit integer to validate.") + raise ValueError("Please input a positive 10 digit integer to validate.") if number < 0: - raise ValueError( - "Please input a postitive 10 digit integer to validate.") + raise ValueError("Please input a postitive 10 digit integer to validate.") digits = [int(digit) for digit in str(number)] # NHS Numbers are 10 digits long. 
if not len(digits) == 10: - raise ValueError( - "Please input a postitive 10 digit integer to validate.") + raise ValueError("Please input a postitive 10 digit integer to validate.") # Apply weighting to first 9 digits weighted_digits = np.dot(np.array(digits[:9]), np.arange(10, 1, -1)) # Validity is based on the check digit, which has to be equal to `remainder` @@ -51,7 +48,7 @@ def nhsNumberValidator(number: int) -> bool: def nhsNumberGenerator(to_generate: int, random_state: int = None) -> list: """ - Generates up to 1M random NHS numbers compliant with modulus 11 checks as recorded + Generates up to 1M random NHS numbers compliant with modulus 11 checks as recorded in the data dictonary. https://www.datadictionary.nhs.uk/data_dictionary/attributes/n/nhs/nhs_number_de.asp?shownav=1 @@ -76,13 +73,11 @@ def nhsNumberGenerator(to_generate: int, random_state: int = None) -> list: if random_state: random.seed(random_state) if not isinstance(to_generate, int): - raise ValueError( - "Please input a positive integer to generate numbers.") + raise ValueError("Please input a positive integer to generate numbers.") if to_generate > 1000000: raise ValueError("More than one million values requested") if to_generate < 0: - raise ValueError( - "Please input a postitive integer to generate numbers.") + raise ValueError("Please input a postitive integer to generate numbers.") generated = [] while len(generated) < to_generate: diff --git a/codonPython/nhsd_colours.py b/codonPython/nhsd_colours.py new file mode 100644 index 0000000..8258e79 --- /dev/null +++ b/codonPython/nhsd_colours.py @@ -0,0 +1,111 @@ +import seaborn as sns +import random + + +def nhsd_colours(): + """Returns a dictionary full of the different official NHSD colours from the + style guide: + https://digital.nhs.uk/about-nhs-digital/corporate-information-and-documents/nhs-digital-style-guidelines/how-we-look/colour-palette + + Parameters + ---------- + None + + Returns + -------- + colour_dict : dict (Python 
dictionary) + A dictionary containing sets of official NHS Digital branding colours + (Hexadecimal format) and fonts. + """ + + nhsd_chart_colours = ["#005EB8", "#71CCEF", "#84919C", "#003087", "#D0D5D6"] + nhsd_chart_background = {"chart_grey_3": "#F8F8F8", "white": "#FFFFFF"} + nhsd_core_colours = { + "white": "#ffffff", + "white_tints": ["#f9fafb", "#f3f5f6", "#edeff1", "#def2e5"], + "nhs_blue": "#005eb8", + "blue_tints": ["#337EC6", "#ACCAE8", "#D4E4F3", "#E6EFF8"], + "nhs_dark_grey": "#425563", + "grey_tints": [ + "#687784", + "#98A4AD", + "#B3BBC1", + "#DFE2E5", + "#EDEFF1", + "#F3F5F6", + "#F9FAFB", + ], + "nhs_mild_grey": "#768692", + "nhs_warm_yellow": "#FFB81C", + "warm_yellow_tints": ["#FFE8B4", "#FFF1CC", "#FFF8E8"], + } + nhsd_font = ["Frutiger Light", "Frutiger Roman"] + nhsd_font_backup = ["Arial"] + colour_dict = { + "chart": nhsd_chart_colours, + "chart_background": nhsd_chart_background, + "core": nhsd_core_colours, + "font": nhsd_font, + "font_backup": nhsd_font_backup, + } + return colour_dict + + +def nhsd_seaborn_style(): + """Sets the seaborn style to be in line with NHSD guidelines. This means your + graphs in Seaborn, or in Matplotlib will come out looking as per the NHSD + style guide. Simply run this function. 
+ + Parameters + ---------- + None + + Returns + ---------- + None""" + nhs_colours = nhsd_colours() + chart_background = nhs_colours["chart_background"] + font_backup = nhs_colours["font_backup"] + chart_colours = nhs_colours["chart"] + + additional_colours = ( + nhsd_colours()["core"]["blue_tints"] + + nhsd_colours()["core"]["grey_tints"] + + [nhsd_colours()["core"]["nhs_warm_yellow"]] + + nhsd_colours()["core"]["warm_yellow_tints"] + ) + random.shuffle(additional_colours) + nhs_colours = chart_colours + additional_colours + + sns.set_palette(nhs_colours) + + seaborn_style_dict = { + "axes.axisbelow": True, + "axes.edgecolor": ".8", + "axes.facecolor": chart_background["chart_grey_3"], + "axes.grid": True, + "axes.labelcolor": ".15", + "axes.spines.bottom": False, # no spines + "axes.spines.left": False, # no spines + "axes.spines.right": False, # no spines + "axes.spines.top": False, # no spines + "figure.facecolor": chart_background["chart_grey_3"], + "font.family": ["sans-serif"], + "font.sans-serif": font_backup, + "grid.color": ".8", + "grid.linestyle": "-", + "image.cmap": "rocket", + "lines.solid_capstyle": "round", + "patch.edgecolor": "w", + "patch.force_edgecolor": True, + "text.color": ".15", + "xtick.bottom": False, + "xtick.color": ".15", + "xtick.direction": "out", + "xtick.top": False, + "ytick.color": ".15", + "ytick.direction": "out", + "ytick.left": False, + "ytick.right": False, + } + sns.set_style("whitegrid", seaborn_style_dict) diff --git a/codonPython/suppression.py b/codonPython/suppression.py index d09e703..15ec2ad 100644 --- a/codonPython/suppression.py +++ b/codonPython/suppression.py @@ -1,10 +1,9 @@ -def suppress_value(valuein: int, rc: str = '*', upper: int = 100000000)->str: +def central_suppression_method(valuein: int, rc: str = "5", upper: int = 5000000000) -> str: """ - Suppress values less than or equal to 7, round all non-national values. 
- This function suppresses value if it is less than or equal to 7. If value is 0 then it will remain as 0. - If value is at national level it will remain unsuppressed. + If value is 1-7 it will be suppressed and appear as 5. All other values will be rounded to the nearest 5. Parameters @@ -14,20 +13,20 @@ def suppress_value(valuein: int, rc: str = '*', upper: int = 100000000)->str: rc : str Replacement character if value needs suppressing upper : int - Upper limit for suppression of numbers + Upper limit for suppression of numbers (5 billion) Returns ------- out : str - Suppressed value (*), 0 or valuein if greater than 7 or national + Suppressed value (5), 0 or rounded valuein if greater than 7 Examples -------- - >>> suppress_value(3) - '*' - >>> suppress_value(24) + >>> central_suppression_method(3) + '5' + >>> central_suppression_method(24) '25' - >>> suppress_value(0) + >>> central_suppression_method(0) '0' """ base = 5 @@ -42,8 +41,7 @@ def suppress_value(valuein: int, rc: str = '*', upper: int = 100000000)->str: elif valuein >= 1 and valuein <= 7: valueout = rc elif valuein > 7 and valuein <= upper: - valueout = str(base * round(valuein/base)) + valueout = str(base * round(valuein / base)) else: - raise ValueError( - "The input: {} is greater than: {}.".format(valuein, upper)) + raise ValueError("The input: {} is greater than: {}.".format(valuein, upper)) return valueout diff --git a/codonPython/tableFromSql.py b/codonPython/tableFromSql.py index e7544a4..01aa3d4 100644 --- a/codonPython/tableFromSql.py +++ b/codonPython/tableFromSql.py @@ -3,8 +3,20 @@ import pandas as pd -def tableFromSql(server: str, database: str, table_name: str, user: str = "", password: str = "", schema: str = None, index_col: str = None, coerce_float: bool = True, parse_dates: list = None, columns: list = None, chunksize: int = None): - ''' +def tableFromSql( + server: str, + database: str, + table_name: str, + user: str = "", + password: str = "", + schema: str = None, + index_col: 
str = None, + coerce_float: bool = True, + parse_dates: list = None, + columns: list = None, + chunksize: int = None, +): + """ Returns a SQL table in a DataFrame. Convert a table stored in SQL Server 2016 into a pandas dataframe. @@ -23,25 +35,25 @@ def tableFromSql(server: str, database: str, table_name: str, user: str = "", pa table_name : string Name of SQL table in database. schema : string, default : None - Name of SQL schema in database to query (if database flavor supports this). Uses + Name of SQL schema in database to query (if database flavor supports this). Uses default schema if None (default). index_col : string or list of strings, default : None Column(s) to set as index(MultiIndex). coerce_float : boolean, default : True - Attempts to convert values of non-string, non-numeric objects (like decimal.Decimal) + Attempts to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point. Can result in loss of Precision. parse_dates : list or dict, default : None - List of column names to parse as dates. - - Dict of {column_name: format string} where format string is strftime compatible in - case of parsing string times or is one of (D, s, ns, ms, us) in case of parsing + - Dict of {column_name: format string} where format string is strftime compatible in + case of parsing string times or is one of (D, s, ns, ms, us) in case of parsing integer timestamps. - - Dict of {column_name: arg dict}, where the arg dict corresponds to the keyword - arguments of pandas.to_datetime() Especially useful with databases without native + - Dict of {column_name: arg dict}, where the arg dict corresponds to the keyword + arguments of pandas.to_datetime() Especially useful with databases without native Datetime support, such as SQLite. 
columns : list, default : None List of column names to select from SQL table chunksize : int, default : None - If specified, returns an iterator where chunksize is the number of rows to include + If specified, returns an iterator where chunksize is the number of rows to include in each chunk. Returns @@ -55,12 +67,22 @@ def tableFromSql(server: str, database: str, table_name: str, user: str = "", pa # pd.DataFrame # >>> tableFromSql("myServer", "myDatabase", "myTable", schema="specialSchema", columns=["col_1", "col_3"]) # pd.DataFrame - ''' + """ try: uri = "mssql+pyodbc://{}:{}@{}/{}?driver=SQL Server Native Client 11.0".format( - user, password, server, database) + user, password, server, database + ) engine = create_engine(uri) - return pd.read_sql_table(table_name, engine, schema=schema, index_col=index_col, coerce_float=coerce_float, parse_dates=parse_dates, columns=columns, chunksize=chunksize) + return pd.read_sql_table( + table_name, + engine, + schema=schema, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + chunksize=chunksize, + ) except Exception as error: raise error diff --git a/codonPython/tests/ODS_test.py b/codonPython/tests/ODS_test.py new file mode 100644 index 0000000..ba05796 --- /dev/null +++ b/codonPython/tests/ODS_test.py @@ -0,0 +1,27 @@ +import pytest +import numpy as np +from codonPython import ODS_lookup + + +def test_successful_query(): + NHSD_code = "X26" + result = ODS_lookup.query_api(NHSD_code) + assert result["Organisation"]["Name"] == "NHS DIGITAL" + + +def test_unsuccessful_query(): + invalid_code = "ASDF" + with pytest.raises(ValueError): + ODS_lookup.query_api(invalid_code) + + +def test_wrong_type(): + invalid_code = 0 + with pytest.raises(ValueError): + ODS_lookup.query_api(invalid_code) + + +def test_unsuccessful_address_query(): + invalid_code = ["ASDF", np.nan, None] + result = ODS_lookup.get_addresses(invalid_code) + assert result.empty diff --git 
a/codonPython/tests/SQL_connections_test.py b/codonPython/tests/SQL_connections_test.py new file mode 100644 index 0000000..9ad8fe1 --- /dev/null +++ b/codonPython/tests/SQL_connections_test.py @@ -0,0 +1,15 @@ +'''test script for SQL_connections +- test the connections can run a dummy script (SELECT 1 as [Code], 'test' as [Name])''' +import pandas as pd +import pytest +import codonPython.SQL_connections as conn + + +@pytest.mark.parametrize("connection", + [conn.conn_dummy(), + conn.conn_dummy('test.db') + ]) +def test_select1(connection): + result = pd.read_sql("""SELECT 1 as [Code], 'Test' as [Name]""", connection).iloc[0, 0] + expected = pd.DataFrame([{'Code': 1, 'Name': 'Test'}]).iloc[0, 0] + assert result == expected diff --git a/codonPython/tests/age_bands_test.py b/codonPython/tests/age_bands_test.py index e097fe7..f0200c5 100644 --- a/codonPython/tests/age_bands_test.py +++ b/codonPython/tests/age_bands_test.py @@ -4,19 +4,22 @@ import pytest -@pytest.mark.parametrize("age, expected", [ - (0, '0-4'), - (1, '0-4'), - (12, '10-14'), - (23, '20-24'), - (34, '30-34'), - (35, '35-39'), - (46, '45-49'), - (57, '55-59'), - (68, '65-69'), - (79, '75-79'), - (90, '90 and over'), -]) +@pytest.mark.parametrize( + "age, expected", + [ + (0, "0-4"), + (1, "0-4"), + (12, "10-14"), + (23, "20-24"), + (34, "30-34"), + (35, "35-39"), + (46, "45-49"), + (57, "55-59"), + (68, "65-69"), + (79, "75-79"), + (90, "90 and over"), + ], +) def test_age_band_5_years_BAU(age, expected): assert expected == age_bands.age_band_5_years(age) @@ -26,55 +29,53 @@ def test_age_band_5_years_typeErrors(): age_bands.age_band_5_years("age") -@pytest.mark.parametrize("age", [ - np.nan, - math.inf, - -3, - 343, - -0.1 -]) +@pytest.mark.parametrize("age", [np.nan, math.inf, -3, 343, -0.1]) def test_age_band_5_years_valueErrors(age): with pytest.raises(ValueError): age_bands.age_band_5_years(age) -@pytest.mark.parametrize("age, expected", [ - (None, 'Age not known'), -]) 
+@pytest.mark.parametrize("age, expected", [(None, "Age not known")]) def test_age_band_5_years_edgeCases(age, expected): assert expected == age_bands.age_band_5_years(age) -@pytest.mark.parametrize("age, expected", [ - (0.1, '0-4'), - (1.2, '0-4'), - (12.3, '10-14'), - (23.4, '20-24'), - (34.5, '30-34'), - (35.6, '35-39'), - (46.7, '45-49'), - (57.8, '55-59'), - (68.9, '65-69'), - (79.0, '75-79'), - (90.1, '90 and over'), -]) +@pytest.mark.parametrize( + "age, expected", + [ + (0.1, "0-4"), + (1.2, "0-4"), + (12.3, "10-14"), + (23.4, "20-24"), + (34.5, "30-34"), + (35.6, "35-39"), + (46.7, "45-49"), + (57.8, "55-59"), + (68.9, "65-69"), + (79.0, "75-79"), + (90.1, "90 and over"), + ], +) def test_age_band_5_years_BAU_floats(age, expected): assert expected == age_bands.age_band_5_years(age) -@pytest.mark.parametrize("age, expected", [ - (0, '0-9'), - (1, '0-9'), - (12, '10-19'), - (23, '20-29'), - (34, '30-39'), - (35, '30-39'), - (46, '40-49'), - (57, '50-59'), - (68, '60-69'), - (79, '70-79'), - (90, '90 and over'), -]) +@pytest.mark.parametrize( + "age, expected", + [ + (0, "0-9"), + (1, "0-9"), + (12, "10-19"), + (23, "20-29"), + (34, "30-39"), + (35, "30-39"), + (46, "40-49"), + (57, "50-59"), + (68, "60-69"), + (79, "70-79"), + (90, "90 and over"), + ], +) def test_age_band_10_years_BAU(age, expected): assert expected == age_bands.age_band_10_years(age) @@ -84,37 +85,32 @@ def test_age_band_10_years_typeErrors(): age_bands.age_band_10_years("age") -@pytest.mark.parametrize("age", [ - np.nan, - math.inf, - -3, - 343, - -0.1 -]) +@pytest.mark.parametrize("age", [np.nan, math.inf, -3, 343, -0.1]) def test_age_band_10_years_valueErrors(age): with pytest.raises(ValueError): age_bands.age_band_10_years(age) -@pytest.mark.parametrize("age, expected", [ - (None, 'Age not known'), -]) +@pytest.mark.parametrize("age, expected", [(None, "Age not known")]) def test_age_band_10_years_edgeCases(age, expected): assert expected == age_bands.age_band_10_years(age) 
-@pytest.mark.parametrize("age, expected", [ - (0.1, '0-9'), - (1.2, '0-9'), - (12.3, '10-19'), - (23.4, '20-29'), - (34.5, '30-39'), - (35.6, '30-39'), - (46.7, '40-49'), - (57.8, '50-59'), - (68.9, '60-69'), - (79.0, '70-79'), - (90.1, '90 and over'), -]) +@pytest.mark.parametrize( + "age, expected", + [ + (0.1, "0-9"), + (1.2, "0-9"), + (12.3, "10-19"), + (23.4, "20-29"), + (34.5, "30-39"), + (35.6, "30-39"), + (46.7, "40-49"), + (57.8, "50-59"), + (68.9, "60-69"), + (79.0, "70-79"), + (90.1, "90 and over"), + ], +) def test_age_band_10_years_BAU_floats(age, expected): assert expected == age_bands.age_band_10_years(age) diff --git a/codonPython/tests/check_consistent_measures_test.py b/codonPython/tests/check_consistent_measures_test.py index 5289fd8..0ab7e2c 100644 --- a/codonPython/tests/check_consistent_measures_test.py +++ b/codonPython/tests/check_consistent_measures_test.py @@ -1,68 +1,115 @@ -from codonPython.check_consistent_measures import check_consistent_measures +from codonPython.validation.check_consistent_measures import check_consistent_measures import pandas as pd import numpy as np import pytest -@pytest.mark.parametrize("data, geography_col, measure_col, measures_set, expected", [ - ( - pd.DataFrame({ - "Geog" : ["National" ,"National", "Region", "Region", "Local", "Local"], - "measure" : ["m1", "m2", "m1", "m2", "m1", "m2"], - "Value_Unsuppressed" : [4, 2, 2, 1, 2, 1], - }), - "Geog", - "measure", - set({"m1", "m2"}), - True - ), - ( - pd.DataFrame({ - "Geog" : ["National" ,"National", "Region", "Region", "Local", "Local"], - "measure" : ["m1", "m2", "m1", "m3", "m1", "m2"], - "Value_Unsuppressed" : [4, 2, 2, 1, 2, 1], - }), - "Geog", - "measure", - set({"m1", "m2"}), - False - ) -]) +@pytest.mark.parametrize( + "data, geography_col, measure_col, measures_set, expected", + [ + ( + pd.DataFrame( + { + "Geog": [ + "National", + "National", + "Region", + "Region", + "Local", + "Local", + ], + "measure": ["m1", "m2", "m1", "m2", "m1", "m2"], + 
"Value_Unsuppressed": [4, 2, 2, 1, 2, 1], + } + ), + "Geog", + "measure", + set({"m1", "m2"}), + True, + ), + ( + pd.DataFrame( + { + "Geog": [ + "National", + "National", + "Region", + "Region", + "Local", + "Local", + ], + "measure": ["m1", "m2", "m1", "m3", "m1", "m2"], + "Value_Unsuppressed": [4, 2, 2, 1, 2, 1], + } + ), + "Geog", + "measure", + set({"m1", "m2"}), + False, + ), + ], +) def test_each_org_levels_BAU(data, geography_col, measure_col, measures_set, expected): - assert expected == check_consistent_measures(data, geography_col, measure_col, measures_set) - - -@pytest.mark.parametrize("data, geography_col, measure_col, measures_set", [ - ( - pd.DataFrame({ - "Geog" : ["National" ,"National", "Region", "Region", "Local", "Local"], - "measure" : ["m1", "m2", "m1", np.nan, "m1", "m2"], - "Value_Unsuppressed" : [4, 2, 2, 1, 2, 1], - }), - "Geog", - "measure", - set({"m1", "m2"}), - ), -]) + assert expected == check_consistent_measures( + data, geography_col, measure_col, measures_set + ) -def test_each_org_levels_valueErrors_measure_col(data, geography_col, measure_col, measures_set): +@pytest.mark.parametrize( + "data, geography_col, measure_col, measures_set", + [ + ( + pd.DataFrame( + { + "Geog": [ + "National", + "National", + "Region", + "Region", + "Local", + "Local", + ], + "measure": ["m1", "m2", "m1", np.nan, "m1", "m2"], + "Value_Unsuppressed": [4, 2, 2, 1, 2, 1], + } + ), + "Geog", + "measure", + set({"m1", "m2"}), + ) + ], +) +def test_each_org_levels_valueErrors_measure_col( + data, geography_col, measure_col, measures_set +): with pytest.raises(ValueError): check_consistent_measures(data, geography_col, measure_col, measures_set) -@pytest.mark.parametrize("data, geography_col, measure_col, measures_set", [ - ( - pd.DataFrame({ - "Geog" : ["National" ,"National", "Region", "Region", "Local", "Local"], - "measure" : ["m1", "m2", "m1", "m2", "m1", "m2"], - "Value_Unsuppressed" : [4, 2, 2, 1, 2, 1], - }), - "Global", - "measure", - set({"m1", 
"m2"}), - ) -]) +@pytest.mark.parametrize( + "data, geography_col, measure_col, measures_set", + [ + ( + pd.DataFrame( + { + "Geog": [ + "National", + "National", + "Region", + "Region", + "Local", + "Local", + ], + "measure": ["m1", "m2", "m1", "m2", "m1", "m2"], + "Value_Unsuppressed": [4, 2, 2, 1, 2, 1], + } + ), + "Global", + "measure", + set({"m1", "m2"}), + ) + ], +) def test_each_geography_col_keyError(data, geography_col, measure_col, measures_set): with pytest.raises(KeyError): - check_consistent_measures(data, geography_col, measure_col, measures_set) \ No newline at end of file + check_consistent_measures(data, geography_col, measure_col, measures_set) diff --git a/codonPython/tests/check_consistent_submissions_test.py b/codonPython/tests/check_consistent_submissions_test.py index f8ea005..49c30d5 100644 --- a/codonPython/tests/check_consistent_submissions_test.py +++ b/codonPython/tests/check_consistent_submissions_test.py @@ -1,92 +1,132 @@ -from codonPython.check_consistent_submissions import check_consistent_submissions +from codonPython.validation.check_consistent_submissions import check_consistent_submissions import pandas as pd import numpy as np import pytest -@pytest.mark.parametrize("data, national_geog_level, geography_col, submissions_col, measure_col, expected", [ - ( - pd.DataFrame({ - "Geog" : ["N" ,"N", "Region", "Region", "Local", "Local",], - "measure" : ["m1", "m2", "m1", "m2", "m1", "m2",], - "submissions" : [4, 2, 2, 1, 2, 1,], - }), - "N", - "Geog", - "submissions", - "measure", - True - ), - ( - pd.DataFrame({ - "Org_Level" : ["National" ,"National", "Region", "Region", "Local", "Local",], - "Measure" : ["m1", "m2", "m1", "m2", "m1", "m2",], - "Value_Unsuppressed" : [4, 2, 3, 1, 2, 1,], - }), - "National", - "Org_Level", - "Value_Unsuppressed", - "Measure", - False - ) -]) - -def test_each_consistent_measure_BAU(data, national_geog_level, geography_col, submissions_col, measure_col, expected): - assert expected == 
check_consistent_submissions(data, national_geog_level, geography_col, submissions_col, measure_col) -@pytest.mark.parametrize("data, national_geog_level, geography_col, submissions_col, measure_col",[ - ( - pd.DataFrame({ - "Geog" : ["N" ,"N", "Region", "Region", "Local", "Local",], - "measure" : ["m1", "m2", "m1", "m2", "m1", "m2",], - "submissions" : [4, 2, 2, 1, 2, 1,], - }), - 1, - "Geog", - "submissions", - "measure" - ), - ( - pd.DataFrame({ - "Geog" : ["N", "N", "Region", "Region", "Local", "Local",], - "measure" : ["m1", "m2", "m1", "m2", "m1", "m2",], - "submissions" : [4, 2, 2, 1, 2, 1,], - }), - "N", - False, - "submissions", - "measure" - ), - ( - pd.DataFrame({ - "Geog" : ["N", "N", "Region", "Region", "Local", "Local",], - "measure" : ["m1", "m2", "m2", "m2", "m1", "m2",], - "submissions" : [4, 2, 2, 1, 2, 1,], - }), - "N", - "Geog", - 4.2, - "measure" +@pytest.mark.parametrize( + "data, national_geog_level, geography_col, submissions_col, measure_col, expected", + [ + ( + pd.DataFrame( + { + "Geog": ["N", "N", "Region", "Region", "Local", "Local"], + "measure": ["m1", "m2", "m1", "m2", "m1", "m2"], + "submissions": [4, 2, 2, 1, 2, 1], + } + ), + "N", + "Geog", + "submissions", + "measure", + True, + ), + ( + pd.DataFrame( + { + "Org_Level": [ + "National", + "National", + "Region", + "Region", + "Local", + "Local", + ], + "Measure": ["m1", "m2", "m1", "m2", "m1", "m2"], + "Value_Unsuppressed": [4, 2, 3, 1, 2, 1], + } + ), + "National", + "Org_Level", + "Value_Unsuppressed", + "Measure", + False, + ), + ], +) +def test_each_consistent_measure_BAU( + data, national_geog_level, geography_col, submissions_col, measure_col, expected +): + assert expected == check_consistent_submissions( + data, national_geog_level, geography_col, submissions_col, measure_col ) -]) -def test_each_consistent_submissions_valueErrors(data, national_geog_level, geography_col, submissions_col, measure_col): + +@pytest.mark.parametrize( + "data, national_geog_level, 
geography_col, submissions_col, measure_col", + [ + ( + pd.DataFrame( + { + "Geog": ["N", "N", "Region", "Region", "Local", "Local"], + "measure": ["m1", "m2", "m1", "m2", "m1", "m2"], + "submissions": [4, 2, 2, 1, 2, 1], + } + ), + 1, + "Geog", + "submissions", + "measure", + ), + ( + pd.DataFrame( + { + "Geog": ["N", "N", "Region", "Region", "Local", "Local"], + "measure": ["m1", "m2", "m1", "m2", "m1", "m2"], + "submissions": [4, 2, 2, 1, 2, 1], + } + ), + "N", + False, + "submissions", + "measure", + ), + ( + pd.DataFrame( + { + "Geog": ["N", "N", "Region", "Region", "Local", "Local"], + "measure": ["m1", "m2", "m2", "m2", "m1", "m2"], + "submissions": [4, 2, 2, 1, 2, 1], + } + ), + "N", + "Geog", + 4.2, + "measure", + ), + ], +) +def test_each_consistent_submissions_valueErrors( + data, national_geog_level, geography_col, submissions_col, measure_col +): with pytest.raises(ValueError): - check_consistent_submissions(data, national_geog_level, geography_col, submissions_col, measure_col) + check_consistent_submissions( + data, national_geog_level, geography_col, submissions_col, measure_col + ) -@pytest.mark.parametrize("data, national_geog_level, geography_col, submissions_col, measure_col", [ - ( - pd.DataFrame({ - "Geog" : ["N" ,"N", "Region", "Region", "Local", "Local",], - "measure" : ["m1", "m2", "m1", "m2", "m1", "m2",], - "submissions" : [4, 2, 2, 1, 2, 1,], - }), - "N", - "Geog", - "submissions", - "measurez" - ) -]) -def test_each_consistent_submissions_colError(data, national_geog_level, geography_col, submissions_col, measure_col): +@pytest.mark.parametrize( + "data, national_geog_level, geography_col, submissions_col, measure_col", + [ + ( + pd.DataFrame( + { + "Geog": ["N", "N", "Region", "Region", "Local", "Local"], + "measure": ["m1", "m2", "m1", "m2", "m1", "m2"], + "submissions": [4, 2, 2, 1, 2, 1], + } + ), + "N", + "Geog", + "submissions", + "measurez", + ) + ], +) +def test_each_consistent_submissions_colError( + data, national_geog_level, 
geography_col, submissions_col, measure_col +): with pytest.raises(KeyError): - check_consistent_submissions(data, national_geog_level, geography_col, submissions_col, measure_col) + check_consistent_submissions( + data, national_geog_level, geography_col, submissions_col, measure_col + ) diff --git a/codonPython/tests/check_nat_val_test.py b/codonPython/tests/check_nat_val_test.py index f019d0f..2e311ba 100644 --- a/codonPython/tests/check_nat_val_test.py +++ b/codonPython/tests/check_nat_val_test.py @@ -1,76 +1,87 @@ -from codonPython.check_nat_val import check_nat_val +from codonPython.validation.check_nat_val import check_nat_val import pytest import pandas as pd -df = pd.DataFrame({ - "Breakdown" : [ - 'National', 'CCG', 'CCG', 'Provider', 'Provider', - 'National' ,'CCG', 'CCG', 'Provider', 'Provider', - 'National' ,'CCG', 'CCG', 'Provider', 'Provider', - ], - "measure" : [ - 'm1', 'm1', 'm1', 'm1', 'm1', - 'm2', 'm2', 'm2', 'm2', 'm2', - 'm3', 'm3', 'm3', 'm3', 'm3', - ], - "Value_Unsuppressed" : [ - 9, 4, 5, 3, 6, - 11, 2, 9, 7, 4, - 9, 5, 4, 6, 3 - ], -}) +df = pd.DataFrame( + { + "Breakdown": [ + "National", + "CCG", + "CCG", + "Provider", + "Provider", + "National", + "CCG", + "CCG", + "Provider", + "Provider", + "National", + "CCG", + "CCG", + "Provider", + "Provider", + ], + "measure": [ + "m1", + "m1", + "m1", + "m1", + "m1", + "m2", + "m2", + "m2", + "m2", + "m2", + "m3", + "m3", + "m3", + "m3", + "m3", + ], + "Value_Unsuppressed": [9, 4, 5, 3, 6, 11, 2, 9, 7, 4, 9, 5, 4, 6, 3], + } +) -@pytest.mark.parametrize("df, breakdown_col, measure_col, value_col, nat_val, expected", [ - ( - df, - "Breakdown", - "measure", - "Value_Unsuppressed", - "National", - True - ), -]) + +@pytest.mark.parametrize( + "df, breakdown_col, measure_col, value_col, nat_val, expected", + [(df, "Breakdown", "measure", "Value_Unsuppressed", "National", True)], +) def test_BAU(df, breakdown_col, measure_col, value_col, nat_val, expected): - assert check_nat_val( - df, - 
breakdown_col=breakdown_col, - measure_col=measure_col, - value_col=value_col, - nat_val=nat_val, - ) == expected + assert ( + check_nat_val( + df, + breakdown_col=breakdown_col, + measure_col=measure_col, + value_col=value_col, + nat_val=nat_val, + ) + == expected + ) -@pytest.mark.parametrize("df, breakdown_col, measure_col, value_col, nat_val", [ - ( - df, - "Breakdown", - 23, # Not a string - "Value_Unsuppressed", - "National", - ), - ( - df, - 0.1, # Not a string - "Measure", - "Value_Unsuppressed", - "National", - ), - ( - df, - "Breakdown", - "Measure", - pd.DataFrame({"wrong" : [1, 2, 3]}), # Not a string - "National", - ), - ( - df, - "Breakdown", - "Measure", - "Value_Unsuppressed", - set({"m1", "m2"}), # Not a string - ), -]) +@pytest.mark.parametrize( + "df, breakdown_col, measure_col, value_col, nat_val", + [ + (df, "Breakdown", 23, "Value_Unsuppressed", "National"), # Not a string + (df, 0.1, "Measure", "Value_Unsuppressed", "National"), # Not a string + ( + df, + "Breakdown", + "Measure", + pd.DataFrame({"wrong": [1, 2, 3]}), # Not a string + "National", + ), + ( + df, + "Breakdown", + "Measure", + "Value_Unsuppressed", + set({"m1", "m2"}), # Not a string + ), + ], +) def test_ValueErrors(df, breakdown_col, measure_col, value_col, nat_val): with pytest.raises(ValueError): check_nat_val( @@ -82,15 +93,10 @@ def test_ValueErrors(df, breakdown_col, measure_col, value_col, nat_val): ) -@pytest.mark.parametrize("df, breakdown_col, measure_col, value_col, nat_val", [ - ( - df, - "Breakdown", - "measure", - "Wrong_Column", - "National", - ) -]) +@pytest.mark.parametrize( + "df, breakdown_col, measure_col, value_col, nat_val", + [(df, "Breakdown", "measure", "Wrong_Column", "National")], +) def test_KeyErrors(df, breakdown_col, measure_col, value_col, nat_val): with pytest.raises(KeyError): check_nat_val( @@ -99,4 +105,4 @@ def test_KeyErrors(df, breakdown_col, measure_col, value_col, nat_val): measure_col=measure_col, value_col=value_col, nat_val=nat_val, - 
) \ No newline at end of file + ) diff --git a/codonPython/tests/check_null_test.py b/codonPython/tests/check_null_test.py index f2facc2..1a64e33 100644 --- a/codonPython/tests/check_null_test.py +++ b/codonPython/tests/check_null_test.py @@ -1,32 +1,33 @@ -from codonPython.check_null import check_null +from codonPython.validation.check_null import check_null import numpy as np import pandas as pd import pytest -testdata = pd.DataFrame({ - "col1" : [1,2,3,4,5,6,7,8,9,10], - "col2" : [11,12,13,14,15,np.nan,np.nan,18,19,20], -}) +testdata = pd.DataFrame( + { + "col1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "col2": [11, 12, 13, 14, 15, np.nan, np.nan, 18, 19, 20], + } +) -@pytest.mark.parametrize("dataframe, columns_to_be_checked, expected", [ - (testdata.iloc[:5, :], ["col1", "col2"], 0), - (testdata, ["col2"], 2), -]) + +@pytest.mark.parametrize( + "dataframe, columns_to_be_checked, expected", + [(testdata.iloc[:5, :], ["col1", "col2"], 0), (testdata, ["col2"], 2)], +) def test_BAU(dataframe, columns_to_be_checked, expected): assert check_null(dataframe, columns_to_be_checked) == expected -@pytest.mark.parametrize("dataframe, columns_to_be_checked", [ - (testdata, 0.01), -]) +@pytest.mark.parametrize("dataframe, columns_to_be_checked", [(testdata, 0.01)]) def test_ValueError(dataframe, columns_to_be_checked): with pytest.raises(ValueError): check_null(dataframe, columns_to_be_checked) -@pytest.mark.parametrize("dataframe, columns_to_be_checked", [ - (testdata, ["wrong_column"]), -]) +@pytest.mark.parametrize( + "dataframe, columns_to_be_checked", [(testdata, ["wrong_column"])] +) def test_KeyError(dataframe, columns_to_be_checked): with pytest.raises(KeyError): - check_null(dataframe, columns_to_be_checked) \ No newline at end of file + check_null(dataframe, columns_to_be_checked) diff --git a/codonPython/tests/dateValidator_test.py b/codonPython/tests/dateValidator_test.py index d527e6a..44a0a1e 100644 --- a/codonPython/tests/dateValidator_test.py +++ 
b/codonPython/tests/dateValidator_test.py @@ -1,26 +1,30 @@ -from codonPython import dateValidator -import numpy as np -import math +from codonPython.validation import dateValidator import pytest -@pytest.mark.parametrize("date_string, expected", [ - ('01/01/1900', True), # Edge date - ('29/02/1992', True), # Leap Year - ('31/05/2020', True), # 31-day month - ('29/02/2040', True), # Leap Year - ('31/12/2049', True), # Edge date -]) +@pytest.mark.parametrize( + "date_string, expected", + [ + ("01/01/1900", True), # Edge date + ("29/02/1992", True), # Leap Year + ("31/05/2020", True), # 31-day month + ("29/02/2040", True), # Leap Year + ("31/12/2049", True), # Edge date + ], +) def test_validDate_positives(date_string, expected): assert expected == dateValidator.validDate(date_string) -@pytest.mark.parametrize("date_string, expected", [ - ('31/12/1899', False), # Edge date - ('29/02/1990', False), # Leap Year - ('31/04/2020', False), # 31-day month - ('29/02/2041', False), # Leap Year - ('01/01/2050', False), # Edge date -]) +@pytest.mark.parametrize( + "date_string, expected", + [ + ("31/12/1899", False), # Edge date + ("29/02/1990", False), # Leap Year + ("31/04/2020", False), # 31-day month + ("29/02/2041", False), # Leap Year + ("01/01/2050", False), # Edge date + ], +) def test_validDate_negatives(date_string, expected): assert expected == dateValidator.validDate(date_string) diff --git a/codonPython/tests/file_utils_test.py b/codonPython/tests/file_utils_test.py new file mode 100644 index 0000000..12effb2 --- /dev/null +++ b/codonPython/tests/file_utils_test.py @@ -0,0 +1,311 @@ +from codonPython.file_utils import compare +from codonPython.file_utils import file_search +from codonPython.file_utils import import_files +import numpy as np +import pytest +import pandas as pd + +df1 = pd.DataFrame( + { + "A": [1, 5, 6, 1, 8, 5, 9], + "B": [2, 8, 5, 2, 21, 3, 5], + "C": [3, 4, 5, 3, 1, 5, 9], + "D": [2, 8, 5, 2, 4, 6, 2], + "E": [1, 2, 6, 1, 3, 5, 5], + } +) + +df2 
= pd.DataFrame( + { + "A": [1, 5, 6, 1, 9, 5, 9], + "B": [2, 9, 5, 2, 21, 3, 5], + "C": [3, 4, 5, 3, 1, 35, 9], + "D": [2, 8, 7, 2, 4, 6, 2], + "E": [1, 2, 46, 1, 3, 8, 5], + } +) + +dict_test = { + "same_values": pd.DataFrame( + np.array([[1, 2, 3, 2, 1], [9, 5, 9, 2, 5]]), columns=["A", "B", "C", "D", "E"] + ), + "df1_not_df2": pd.DataFrame( + np.array([[5, 8, 4, 8, 2], [6, 5, 5, 5, 6], [8, 21, 1, 4, 3], [5, 3, 5, 6, 5]]), + columns=["A", "B", "C", "D", "E"], + ), + "df2_not_df1": pd.DataFrame( + np.array( + [[5, 9, 4, 8, 2], [6, 5, 5, 7, 46], [9, 21, 1, 4, 3], [5, 3, 35, 6, 8]] + ), + columns=["A", "B", "C", "D", "E"], + ), + "df1_dups": pd.DataFrame( + np.array([[1, 2, 3, 2, 1]]), columns=["A", "B", "C", "D", "E"] + ), + "df2_dups": pd.DataFrame( + np.array([[1, 2, 3, 2, 1]]), columns=["A", "B", "C", "D", "E"] + ), + "Same": False, +} + + +@pytest.mark.parametrize( + "x, y, names, dups, same, expected", + [ + ( + pd.DataFrame( + { + "A": [1, 5, 6, 1, 8, 5, 9], + "B": [2, 8, 5, 2, 21, 3, 5], + "C": [3, 4, 5, 3, 1, 5, 9], + "D": [2, 8, 5, 2, 4, 6, 2], + "E": [1, 2, 6, 1, 3, 5, 5], + } + ), + pd.DataFrame( + { + "A": [1, 5, 6, 1, 9, 5, 9], + "B": [2, 9, 5, 2, 21, 3, 5], + "C": [3, 4, 5, 3, 1, 35, 9], + "D": [2, 8, 7, 2, 4, 6, 2], + "E": [1, 2, 46, 1, 3, 8, 5], + } + ), + ["df1", "df2"], + True, + True, + dict_test, + ) + ], +) +def test_compare_BAU(x, y, names, dups, same, expected): + dict_test1 = compare(x, y, names=["df1", "df2"], dups=True, same=True) + for i in expected.keys(): + if i == "Same": + assert dict_test1[i] == expected[i] + else: + for j in expected[i]: + list_test1 = list(dict_test1[i][j]) + list_exp = list(expected[i][j]) + assert list_test1 == list_exp + + +@pytest.mark.parametrize( + "doctype, like, strict, expected", [("md", ["README"], True, ["README.md"])] +) +def test_file_search_BAU(doctype, like, strict, expected): + assert file_search(doctype=doctype, like=like, strict=strict) == expected + + +@pytest.mark.parametrize("expected", [({})]) 
+def test_import_files_BAU(expected): + assert import_files() == expected + + +@pytest.mark.parametrize("subdir, expected", [(True, {})]) +def test_import_files_BAU_2(subdir, expected): + assert import_files(subdir=subdir) == expected + + +@pytest.mark.parametrize("strict,subdir, expected", [(True, True, {})]) +def test_import_files_BAU_3(strict, subdir, expected): + assert import_files(strict=strict, subdir=subdir) == expected + + +# ----------------Console output------------------------- + + +@pytest.mark.parametrize( + "x, y, names, dups, same, comment", + [ + ( + pd.DataFrame( + { + "A": [1, 5, 6, 1, 8, 5, 9], + "B": [2, 8, 5, 2, 21, 3, 5], + "C": [3, 4, 5, 3, 1, 5, 9], + "D": [2, 8, 5, 2, 4, 6, 2], + "E": [1, 2, 6, 1, 3, 5, 5], + } + ), + pd.DataFrame( + { + "A": [1, 5, 6, 1, 9, 5, 9], + "B": [2, 9, 5, 2, 21, 3, 5], + "C": [3, 4, 5, 3, 1, 35, 9], + "D": [2, 8, 7, 2, 4, 6, 2], + "E": [1, 2, 46, 1, 3, 8, 5], + } + ), + ["df1", "df2"], + True, + True, + True, + ) + ], +) +def test_compare_console(x, y, names, dups, same, comment, capsys): + dict_test1 = compare( + x, y, names=["df1", "df2"], dups=True, same=True, comment=comment + ) + captured = capsys.readouterr() + assert ( + captured.out + == "\nThere are " + + str(dict_test1["same_values"].shape[0]) + + " same values\nThere are " + + str(dict_test1[names[0] + "_not_" + names[1]].shape[0]) + + " outliers in " + + str(names[0]) + + "\nThere are " + + str(dict_test1[names[1] + "_not_" + names[0]].shape[0]) + + " outliers in " + + str(names[1]) + + "\nThere are " + + str(dict_test1[names[0] + "_dups"].shape[0]) + + " duplicates in " + + str(names[0]) + + "\nThere are " + + str(dict_test1[names[1] + "_dups"].shape[0]) + + " duplicates in " + + str(names[1]) + + "\nDataFrames are not the same\n" + ) + +# -------------ValueError tests----------------- + +# -------------File Search---------------------- + + +@pytest.mark.parametrize("like", [("txt")]) +def test_file_search_ValueError_1(like): + + with 
pytest.raises(ValueError): + + file_search(like=like) + + +@pytest.mark.parametrize("path", [(1)]) +def test_file_search_ValueError_2(path): + + with pytest.raises(ValueError): + + file_search(path=path) + + +@pytest.mark.parametrize("doctype", [(["txt"])]) +def test_file_search_ValueError_3(doctype): + + with pytest.raises(ValueError): + + file_search(doctype=doctype) + + +@pytest.mark.parametrize("strict", [("True")]) +def test_file_search_ValueError_4(strict): + + with pytest.raises(ValueError): + + file_search(strict=strict) + + +# -----------------Import files------------------------- + + +@pytest.mark.parametrize("like", [("txt")]) +def test_import_files_ValueError_1(like): + + with pytest.raises(ValueError): + + import_files(like=like) + + +@pytest.mark.parametrize("subdir", [("True")]) +def test_import_files_ValueError_2(subdir): + + with pytest.raises(ValueError): + + import_files(subdir=subdir) + + +@pytest.mark.parametrize("doctype", [(["txt"])]) +def test_import_files_ValueError_3(doctype): + + with pytest.raises(ValueError): + + import_files(doctype=doctype) + + +@pytest.mark.parametrize("sheet", [(1)]) +def test_import_files_ValueError_4(sheet): + + with pytest.raises(ValueError): + + import_files(sheet=sheet) + + +@pytest.mark.parametrize("path", [(["Desktop"])]) +def test_import_files_ValueError_5(path): + + with pytest.raises(ValueError): + + import_files(path=path) + + +@pytest.mark.parametrize("strict", [("True")]) +def test_import_files_ValueError_6(strict): + + with pytest.raises(ValueError): + + import_files(strict=strict) + + +# ---------------Compare-------------------------- + + +@pytest.mark.parametrize("names", [("txt")]) +def test_compare_ValueError_1(names): + + with pytest.raises(ValueError): + + compare(df1, df2, names=names) + + +@pytest.mark.parametrize("x", [([1, 2, 3])]) +def test_compare_ValueError_2(x): + + with pytest.raises(ValueError): + + compare(x, df2, names=["x", "df2"]) + + +@pytest.mark.parametrize("dups", [("True")]) 
+def test_compare_ValueError_3(dups): + + with pytest.raises(ValueError): + + compare(df1, df2, names=["df1", "df2"], dups=dups) + + +@pytest.mark.parametrize("same", [("True")]) +def test_compare_ValueError_4(same): + + with pytest.raises(ValueError): + + compare(df1, df2, names=["df1", "df2"], same=same) + + +@pytest.mark.parametrize("comment", [("True")]) +def test_compare_ValueError_5(comment): + + with pytest.raises(ValueError): + + compare(df1, df2, names=["df1", "df2"], comment=comment) + + +@pytest.mark.parametrize("y", [([1, 2, 3])]) +def test_compare_ValueError_6(y): + + with pytest.raises(ValueError): + + compare(df1, y, names=["df1", "y"]) diff --git a/codonPython/tests/nhsNumber_test.py b/codonPython/tests/nhsNumber_test.py index bbbdebb..c1b966a 100644 --- a/codonPython/tests/nhsNumber_test.py +++ b/codonPython/tests/nhsNumber_test.py @@ -1,41 +1,30 @@ from codonPython.nhsNumber import nhsNumberGenerator, nhsNumberValidator -import numpy as np import pytest import random -@pytest.mark.parametrize("to_generate, random_state, expected", [ - (3, 42, [8429141456, 2625792787, 8235363119]), - (2, 1, [9598980006, 6597925149]) -]) +@pytest.mark.parametrize( + "to_generate, random_state, expected", + [(3, 42, [8429141456, 2625792787, 8235363119]), (2, 1, [9598980006, 6597925149])], +) def test_nhsNumberGenerator_BAU(to_generate, random_state, expected): - assert expected == nhsNumberGenerator( - to_generate, random_state=random_state) + assert expected == nhsNumberGenerator(to_generate, random_state=random_state) -@pytest.mark.parametrize("to_generate", [ - 4.2, - 1000001, - -1 -]) +@pytest.mark.parametrize("to_generate", [4.2, 1000001, -1]) def test_nhsNumberGenerator_valueErrors(to_generate): with pytest.raises(ValueError): nhsNumberGenerator(to_generate) -@pytest.mark.parametrize("to_validate, expected", [ - (9598980006, True), - (9598980007, False) -]) +@pytest.mark.parametrize( + "to_validate, expected", [(9598980006, True), (9598980007, False)] +) def 
test_nhsNumberValidator_BAU(to_validate, expected): assert expected == nhsNumberValidator(to_validate) -@pytest.mark.parametrize("to_validate", [ - 4.2, - 1000001, - -1 -]) +@pytest.mark.parametrize("to_validate", [4.2, 1000001, -1]) def test_nhsNumberValidator_valueErrors(to_validate): with pytest.raises(ValueError): nhsNumberValidator(to_validate) diff --git a/codonPython/tests/suppression_test.py b/codonPython/tests/suppression_test.py index 6af0e37..9e8f04f 100644 --- a/codonPython/tests/suppression_test.py +++ b/codonPython/tests/suppression_test.py @@ -1,25 +1,16 @@ -from codonPython.suppression import suppress_value +from codonPython.suppression import central_suppression_method import pytest -@pytest.mark.parametrize("to_suppress, expected", [ - (0, "0"), - (2, "*"), - (5, "*"), - (8, "10"), - (16, "15"), - (57, "55"), - (10023, "10025") -]) -def test_suppress_value_BAU(to_suppress, expected): - assert expected == suppress_value(to_suppress) +@pytest.mark.parametrize( + "to_suppress, expected", + [(0, "0"), (2, "5"), (5, "5"), (8, "10"), (16, "15"), (57, "55"), (10023, "10025")], +) +def test_central_suppression_method_BAU(to_suppress, expected): + assert expected == central_suppression_method(to_suppress) -@pytest.mark.parametrize("to_suppress", [ - -1, - 4.2, - 100000001 -]) +@pytest.mark.parametrize("to_suppress", [-1, 4.2, 5000000001]) def test_suppress_value_valueErrors(to_suppress): with pytest.raises(ValueError): - suppress_value(to_suppress) + central_suppression_method(to_suppress) diff --git a/codonPython/tests/tolerance_test.py b/codonPython/tests/tolerance_test.py index b12db95..04e2fa7 100644 --- a/codonPython/tests/tolerance_test.py +++ b/codonPython/tests/tolerance_test.py @@ -1,108 +1,102 @@ -from codonPython.tolerance import check_tolerance +from codonPython.validation.tolerance import check_tolerance import numpy as np import pandas as pd import pandas.util.testing as pdt import pytest -## TODO migrate from numpy arrays to pandas 
series/dataframes testdata = [ pd.Series([1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242]), pd.Series([1, 2, 3, 4, 5, 5.5, 6, 6.5, 7]), ] -@pytest.mark.parametrize("t, y, to_exclude, poly_features, alpha, parse_dates, expected", [ - ( - *testdata, - 2, - [1, 2], - 0.05, - False, - pd.DataFrame({ - "t" : [1241, 1242, 1241, 1242], - 'yhat_u': [ - 8.11380197739608, - 9.051653693670929, - 7.127135023632205, - 7.735627110021585, - ], - 'yobs': [6.5, 7.0, 6.5, 7.0], - 'yhat': [ - 7.214285714285714, - 8.071428571428573, - 6.500000000000002, - 6.821428571428574, - ], - 'yhat_l': [ - 6.31476945117535, - 7.091203449186216, - 5.872864976367799, - 5.907230032835563, - ], - 'polynomial': [1, 1, 2, 2] - }), - ), - ( - *testdata, - 2, - [3], - 0.05, - False, - pd.DataFrame({ - "t" : [1241, 1242], - 'yhat_u': [ - 6.753927165005773, - 7.214574732953706, - ], - 'yobs': [6.5, 7.0], - 'yhat': [ - 6.0000000000000036, - 5.571428571428576, - ], - 'yhat_l': [ - 5.2460728349942345, - 3.928282409903445, - ], - 'polynomial': [3, 3] - }), - ), - ( - pd.Series([ # Check dates - "2012-05-16", - "2012-05-17", - "2012-05-18", - "2012-05-19", - "2012-05-20", - "2012-05-21", - "2012-05-22", - "2012-05-23", - "2012-05-24", - ]), - pd.Series([1, 2, 3, 4, 5, 5.5, 6, 6.5, 7]), - 2, - [3], - 0.05, - True, - pd.DataFrame({ - "t" : ["2012-05-23", "2012-05-24"], - 'yhat_u': [ - 6.753927165005773, - 7.214574732953706, - ], - 'yobs': [6.5, 7.0], - 'yhat': [ - 6.0000000000000036, - 5.571428571428576, - ], - 'yhat_l': [ - 5.2460728349942345, - 3.928282409903445, - ], - 'polynomial': [3, 3] - }), - ), -]) -def test_tolerance_checking_BAU(t, y, to_exclude, poly_features, alpha, parse_dates, expected): +@pytest.mark.parametrize( + "t, y, to_exclude, poly_features, alpha, parse_dates, expected", + [ + ( + *testdata, + 2, + [1, 2], + 0.05, + False, + pd.DataFrame( + { + "t": [1241, 1242, 1241, 1242], + "yhat_u": [ + 8.11380197739608, + 9.051653693670929, + 7.127135023632205, + 7.735627110021585, + ], + 
"yobs": [6.5, 7.0, 6.5, 7.0], + "yhat": [ + 7.214285714285714, + 8.071428571428573, + 6.500000000000002, + 6.821428571428574, + ], + "yhat_l": [ + 6.31476945117535, + 7.091203449186216, + 5.872864976367799, + 5.907230032835563, + ], + "polynomial": [1, 1, 2, 2], + } + ), + ), + ( + *testdata, + 2, + [3], + 0.05, + False, + pd.DataFrame( + { + "t": [1241, 1242], + "yhat_u": [6.753927165005773, 7.214574732953706], + "yobs": [6.5, 7.0], + "yhat": [6.0000000000000036, 5.571428571428576], + "yhat_l": [5.2460728349942345, 3.928282409903445], + "polynomial": [3, 3], + } + ), + ), + ( + pd.Series( + [ # Check dates + "2012-05-16", + "2012-05-17", + "2012-05-18", + "2012-05-19", + "2012-05-20", + "2012-05-21", + "2012-05-22", + "2012-05-23", + "2012-05-24", + ] + ), + pd.Series([1, 2, 3, 4, 5, 5.5, 6, 6.5, 7]), + 2, + [3], + 0.05, + True, + pd.DataFrame( + { + "t": ["2012-05-23", "2012-05-24"], + "yhat_u": [6.753927165005773, 7.214574732953706], + "yobs": [6.5, 7.0], + "yhat": [6.0000000000000036, 5.571428571428576], + "yhat_l": [5.2460728349942345, 3.928282409903445], + "polynomial": [3, 3], + } + ), + ), + ], +) +def test_tolerance_checking_BAU( + t, y, to_exclude, poly_features, alpha, parse_dates, expected +): obtained = check_tolerance( t, y, @@ -114,68 +108,52 @@ def test_tolerance_checking_BAU(t, y, to_exclude, poly_features, alpha, parse_da pdt.assert_frame_equal(expected, obtained) -@pytest.mark.parametrize("t, y, to_exclude, poly_features, alpha", [ - ( - *testdata, - 2, - "flamingo", # This should be a list - 0.05, - ), - ( - *testdata, - 2, - [2], - "flamingo", # Needs to be int - ), - ( - *testdata, - 2, - [2], - 42, # Needs to be between 0 and 1 - ), - ( - *testdata, - "flamingo", # Needs to be int - [2], - 0.05, - ), -]) +@pytest.mark.parametrize( + "t, y, to_exclude, poly_features, alpha", + [ + (*testdata, 2, "flamingo", 0.05), # This should be a list + (*testdata, 2, [2], "flamingo"), # Needs to be int + (*testdata, 2, [2], 42), # Needs to be between 0 and 
1 + (*testdata, "flamingo", [2], 0.05), # Needs to be int + ], +) def test_ValueErrors(t, y, to_exclude, poly_features, alpha): with pytest.raises(ValueError): - check_tolerance(t, y, to_exclude=to_exclude, - poly_features=poly_features, alpha=alpha) + check_tolerance( + t, y, to_exclude=to_exclude, poly_features=poly_features, alpha=alpha + ) -@pytest.mark.parametrize("t, y, to_exclude, poly_features, alpha", [ - ( - *testdata, - 2, - [42], # Elements in the list should be between 0 and 4 - 0.05, - ), - ( - *testdata, - 42, # Can't have to_exclude making your sample size smaller than 4 - [2], - 0.05, - ), - ( - pd.Series([1234, 1235, 1236, 1237, 1238, 1239, - 1240, 1241, np.nan]), # Missing t value - pd.Series([1, 2, 3, 4, 5, 5.5, 6, 6.5, 7]), - 2, - [2], - 0.05, - ), - ( - pd.Series([1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242]), - pd.Series([1, 2, 3, 4, 5, 5.5, 6, 6.5, np.nan]), # Missing y value - 2, - [2], - 0.05, - ) -]) +@pytest.mark.parametrize( + "t, y, to_exclude, poly_features, alpha", + [ + (*testdata, 2, [42], 0.05), # Elements in the list should be between 0 and 4 + ( + *testdata, + 42, # Can't have to_exclude making your sample size smaller than 4 + [2], + 0.05, + ), + ( + pd.Series( + [1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, np.nan] + ), # Missing t value + pd.Series([1, 2, 3, 4, 5, 5.5, 6, 6.5, 7]), + 2, + [2], + 0.05, + ), + ( + pd.Series([1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242]), + pd.Series([1, 2, 3, 4, 5, 5.5, 6, 6.5, np.nan]), # Missing y value + 2, + [2], + 0.05, + ), + ], +) def test_AssertionErrors(t, y, to_exclude, poly_features, alpha): with pytest.raises(AssertionError): - check_tolerance(t, y, to_exclude=to_exclude, - poly_features=poly_features, alpha=alpha) + check_tolerance( + t, y, to_exclude=to_exclude, poly_features=poly_features, alpha=alpha + ) diff --git a/codonPython/check_consistent_measures.py b/codonPython/validation/check_consistent_measures.py similarity index 86% rename from 
codonPython/check_consistent_measures.py rename to codonPython/validation/check_consistent_measures.py index 4e8896e..ea68796 100644 --- a/codonPython/check_consistent_measures.py +++ b/codonPython/validation/check_consistent_measures.py @@ -2,7 +2,12 @@ import numpy as np -def check_consistent_measures(data, geography_col: str = "Org_Level", measure_col: str = "Measure", measures_set: set = set()) -> bool: +def check_consistent_measures( + data, + geography_col: str = "Org_Level", + measure_col: str = "Measure", + measures_set: set = set(), +) -> bool: """ Check every measure is in every geography level. @@ -48,7 +53,7 @@ def check_consistent_measures(data, geography_col: str = "Org_Level", measure_co if data.isna().any(axis=None): raise ValueError( - f"Missing values at locations {list(map(tuple, np.argwhere(data.isna().values)))}" + f"Missing values at locations {list(map(tuple, np.argwhere(data.isna().values)))}" ) if not isinstance(geography_col, str) or not isinstance(measure_col, str): raise ValueError("Please input strings for column indexes.") @@ -59,8 +64,7 @@ def check_consistent_measures(data, geography_col: str = "Org_Level", measure_co # Every geography level should have the same set of measures as the global set. 
global_set = measures_set if measures_set else set(data[measure_col].unique()) - subsets = data.groupby(geography_col) \ - .agg({measure_col: "unique"}) + subsets = data.groupby(geography_col).agg({measure_col: "unique"}) subset_agreement = all(set(x) == global_set for x in subsets[measure_col]) return subset_agreement diff --git a/codonPython/check_consistent_submissions.py b/codonPython/validation/check_consistent_submissions.py similarity index 68% rename from codonPython/check_consistent_submissions.py rename to codonPython/validation/check_consistent_submissions.py index c823b97..ebaaa48 100644 --- a/codonPython/check_consistent_submissions.py +++ b/codonPython/validation/check_consistent_submissions.py @@ -1,7 +1,13 @@ import pandas as pd -def check_consistent_submissions(data, national_geog_level: str = "National", geography_col: str = "Org_Level", submissions_col: str = "Value_Unsuppressed", measure_col: str = "Measure", ) -> bool: +def check_consistent_submissions( + data, + national_geog_level: str = "National", + geography_col: str = "Org_Level", + submissions_col: str = "Value_Unsuppressed", + measure_col: str = "Measure", +) -> bool: """ Check total submissions for each measure are the same across all geography levels except national. @@ -49,24 +55,28 @@ def check_consistent_submissions(data, national_geog_level: str = "National", ge """ if ( - not isinstance(submissions_col, str) or - not isinstance(measure_col, str) or - not isinstance(geography_col, str) or - not isinstance(national_geog_level, str) + not isinstance(submissions_col, str) + or not isinstance(measure_col, str) + or not isinstance(geography_col, str) + or not isinstance(national_geog_level, str) ): - raise ValueError("Please input strings for column names and national geography level.") + raise ValueError( + "Please input strings for column names and national geography level." 
+ ) if ( - submissions_col not in data.columns or - measure_col not in data.columns or - geography_col not in data.columns + submissions_col not in data.columns + or measure_col not in data.columns + or geography_col not in data.columns ): raise KeyError("Check column names correspond to the DataFrame.") # All non-national measures should have only one unique submission number for each # geography level. - submissions_by_measure = data[data[geography_col] != national_geog_level] \ - .groupby(measure_col) \ - .agg({submissions_col: "nunique"}) + submissions_by_measure = ( + data[data[geography_col] != national_geog_level] + .groupby(measure_col) + .agg({submissions_col: "nunique"}) + ) result = (submissions_by_measure[submissions_col] == 1).all() return result diff --git a/codonPython/check_nat_val.py b/codonPython/validation/check_nat_val.py similarity index 73% rename from codonPython/check_nat_val.py rename to codonPython/validation/check_nat_val.py index a309b42..56d0501 100644 --- a/codonPython/check_nat_val.py +++ b/codonPython/validation/check_nat_val.py @@ -1,9 +1,13 @@ import pandas as pd -def check_nat_val(df: pd.DataFrame, breakdown_col: str = "Breakdown", - measure_col: str = "Measure", value_col: str = - "Value_Unsuppressed", nat_val: str = "National") -> bool: +def check_nat_val( + df: pd.DataFrame, + breakdown_col: str = "Breakdown", + measure_col: str = "Measure", + value_col: str = "Value_Unsuppressed", + nat_val: str = "National", +) -> bool: """ Check national value less than or equal to sum of breakdowns. 
@@ -66,24 +70,32 @@ def check_nat_val(df: pd.DataFrame, breakdown_col: str = "Breakdown", False """ - if not isinstance(breakdown_col, str) or not isinstance(measure_col, str)\ - or not isinstance(value_col, str): + if ( + not isinstance(breakdown_col, str) + or not isinstance(measure_col, str) + or not isinstance(value_col, str) + ): raise ValueError("Please input strings for column indexes.") if not isinstance(nat_val, str): raise ValueError("Please input strings for value indexes.") - if breakdown_col not in df.columns or measure_col not in df.columns or\ - value_col not in df.columns: + if ( + breakdown_col not in df.columns + or measure_col not in df.columns + or value_col not in df.columns + ): raise KeyError("Check column names correspond to the DataFrame.") -# aggregate values by measure and breakdown - grouped = df.groupby([measure_col, breakdown_col]).agg({value_col: sum})\ - .reset_index() + # aggregate values by measure and breakdown + grouped = ( + df.groupby([measure_col, breakdown_col]).agg({value_col: sum}).reset_index() + ) national = grouped.loc[grouped[breakdown_col] == nat_val].reset_index() non_national = grouped.loc[grouped[breakdown_col] != nat_val].reset_index() -# check values are less than or equal to national value for each measure - join = pd.merge(non_national, national, left_on=measure_col, - right_on=measure_col, how='left') - left = value_col + '_x' - right = value_col + '_y' - join['Check'] = join[right] <= join[left] - result = all(join['Check']) + # check values are less than or equal to national value for each measure + join = pd.merge( + non_national, national, left_on=measure_col, right_on=measure_col, how="left" + ) + left = value_col + "_x" + right = value_col + "_y" + join["Check"] = join[right] <= join[left] + result = all(join["Check"]) return result diff --git a/codonPython/check_null.py b/codonPython/validation/check_null.py similarity index 80% rename from codonPython/check_null.py rename to 
codonPython/validation/check_null.py index dfbdd4c..37333b6 100644 --- a/codonPython/check_null.py +++ b/codonPython/validation/check_null.py @@ -1,11 +1,13 @@ import numpy import pandas as pd -def check_null(dataframe: pd.DataFrame, columns_to_be_checked: list) -> bool: + +def check_null(dataframe: pd.DataFrame, columns_to_be_checked: list) -> int: """ Checks a pandas dataframe for null values - This function takes a pandas dataframe supplied as an argument and returns a integer value representing any null values found within the columns to check + This function takes a pandas dataframe supplied as an argument and returns an integer value + representing any null values found within the columns to check. Parameters ---------- @@ -29,18 +31,16 @@ def check_null(dataframe: pd.DataFrame, columns_to_be_checked: list) -> bool: if not isinstance(columns_to_be_checked, list): raise ValueError("Please make sure that all your columns passed are strings") - else: - pass for eachCol in columns_to_be_checked: if eachCol not in dataframe.columns: - raise KeyError("Please check the column names correspond to values in the DataFrame.") - else: - pass + raise KeyError( + "Please check the column names correspond to values in the DataFrame."
+ ) null_count = 0 for eachColumn in columns_to_be_checked: prev_null_count = null_count null_count = prev_null_count + (len(dataframe) - dataframe[eachColumn].count()) - return null_count \ No newline at end of file + return null_count diff --git a/codonPython/dateValidator.py b/codonPython/validation/dateValidator.py similarity index 66% rename from codonPython/dateValidator.py rename to codonPython/validation/dateValidator.py index 2765de7..3e4fd0e 100644 --- a/codonPython/dateValidator.py +++ b/codonPython/validation/dateValidator.py @@ -3,7 +3,7 @@ def validDate(date_string: str) -> bool: """ - Validates stringtype dates of type `dd/mm/yyyy`, `dd-mm-yyyy` or `dd.mm.yyyy` from + Validates stringtype dates of type `dd/mm/yyyy`, `dd-mm-yyyy` or `dd.mm.yyyy` from years 1900-9999. Leap year support included. Parameters @@ -33,13 +33,15 @@ def validDate(date_string: str) -> bool: # https://stackoverflow.com/questions/15491894/regex-to-validate-date-format-dd-mm-yyyy # modified to confine the year dates. 
if re.match( - r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1" + - r"|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2" + - r"))(?:(?:1[9]..|2[0][0-4].))$|^(?:29(\/|-|\.)0?2\3" + - r"(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]" + - r"|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4" + - r"(?:(?:1[9]..|2[0][0-4].))$", - date_string, flags=0): + r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1" + + r"|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2" + + r"))(?:(?:1[9]..|2[0][0-4].))$|^(?:29(\/|-|\.)0?2\3" + + r"(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]" + + r"|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4" + + r"(?:(?:1[9]..|2[0][0-4].))$", + date_string, + flags=0, + ): return True else: return False diff --git a/codonPython/tolerance.py b/codonPython/validation/tolerance.py similarity index 74% rename from codonPython/tolerance.py rename to codonPython/validation/tolerance.py index 0406f7f..b3f0d14 100644 --- a/codonPython/tolerance.py +++ b/codonPython/validation/tolerance.py @@ -7,7 +7,15 @@ from statsmodels.sandbox.regression.predstd import wls_prediction_std -def check_tolerance(t, y, to_exclude: int = 1, poly_features: list = [1, 2], alpha: float = 0.05, parse_dates: bool = False, predict_all: bool = False) -> pd.DataFrame: +def check_tolerance( + t, + y, + to_exclude: int = 1, + poly_features: list = [1, 2], + alpha: float = 0.05, + parse_dates: bool = False, + predict_all: bool = False, +) -> pd.DataFrame: """ Check that some future values are within a weighted least squares confidence interval. 
@@ -58,32 +66,31 @@ def check_tolerance(t, y, to_exclude: int = 1, poly_features: list = [1, 2], alp """ if not isinstance(poly_features, list): - raise ValueError("Please input a list of integers from 0 to 4 for poly_features.") - assert all(0 <= degree <= 4 for degree in poly_features), ( - "Please ensure all numbers in poly_features are from 0 to 4." - ) + raise ValueError( + "Please input a list of integers from 0 to 4 for poly_features." + ) + assert all( + 0 <= degree <= 4 for degree in poly_features + ), "Please ensure all numbers in poly_features are from 0 to 4." if not isinstance(alpha, float) or 0 > alpha >= 1: raise ValueError("Please input a float between 0 and 1 for alpha.") if not isinstance(to_exclude, int): - raise ValueError("Please input an integer between 1 and your sample size for to_exclude.") - assert ((len(t) - to_exclude) >= 4), ( - """The sample size for your model is smaller than 4. This will not produce a good + raise ValueError( + "Please input an integer between 1 and your sample size for to_exclude." + ) + assert ( + len(t) - to_exclude + ) >= 4, """The sample size for your model is smaller than 4. This will not produce a good model. Either reduce to_exclude or increase your sample size to continue.""" - ) - assert y.notna().all(), ( - f"""Your sample contains missing or infinite values for y at locations + assert y.notna().all(), f"""Your sample contains missing or infinite values for y at locations {list(map(tuple, np.where(np.isnan(y))))}. Exclude these values to continue.""" - ) - assert t.notna().all(), ( - f"""Your sample contains missing or infinite values for t at locations + assert t.notna().all(), f"""Your sample contains missing or infinite values for t at locations {list(map(tuple, np.where(np.isnan(t))))}. 
Exclude these values to continue.""" - ) # Convert date strings to numeric variables for the model if parse_dates: t_numeric = pd.to_datetime(t) - t_numeric = (t_numeric - datetime(1970, 1, 1)) \ - .apply(lambda x: x.days) + t_numeric = (t_numeric - datetime(1970, 1, 1)).apply(lambda x: x.days) # Sort data by t increasing. t_ is for internal use. idx = np.argsort(t_numeric.values) if parse_dates else np.argsort(t.values) @@ -93,10 +100,7 @@ def check_tolerance(t, y, to_exclude: int = 1, poly_features: list = [1, 2], alp results = pd.DataFrame() for degree in poly_features: - transforms = make_pipeline( - StandardScaler(), - PolynomialFeatures(degree=degree), - ) + transforms = make_pipeline(StandardScaler(), PolynomialFeatures(degree=degree)) # Fit transforms to training data only, apply to all data. fitted_transforms = transforms.fit(t_[:-to_exclude].values.reshape(-1, 1)) @@ -108,7 +112,7 @@ def check_tolerance(t, y, to_exclude: int = 1, poly_features: list = [1, 2], alp y if predict_all else y[-to_exclude:], t if predict_all else t[-to_exclude:], ) - + # Fit ordinary least squares model to the training data, then predict for the # prediction data. 
model = sm.OLS(y_train, t_train).fit() @@ -119,16 +123,17 @@ def check_tolerance(t, y, to_exclude: int = 1, poly_features: list = [1, 2], alp # Store model results in master frame results = results.append( - pd.DataFrame({ - "t" : t_orig, - "yhat_u" : yhat_u, - "yobs" : y_predict, - "yhat" : yhat, - "yhat_l" : yhat_l, - "polynomial" : degree - }), + pd.DataFrame( + { + "t": t_orig, + "yhat_u": yhat_u, + "yobs": y_predict, + "yhat": yhat, + "yhat_l": yhat_l, + "polynomial": degree, + } + ), ignore_index=True, ) return results - \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..6f6536f --- /dev/null +++ b/docs/README.md @@ -0,0 +1,3 @@ +# Documentation files + +The files that Sphinx builds the documentation from are located here. ReStructuredText `.rst` files are text files similar to markdown which allow formatting and interactivity with Sphinx. As with the functions in codon, improvements to the documentation files here are welcomed.
diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..9534b01 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/codonPython.rst b/docs/source/codonPython.rst new file mode 100644 index 0000000..e3315e4 --- /dev/null +++ b/docs/source/codonPython.rst @@ -0,0 +1,143 @@ +codonPython package +=================== + +Submodules +---------- + +codonPython.age\_bands module +----------------------------- + +.. automodule:: codonPython.age_bands + :members: + :undoc-members: + :show-inheritance: + +codonPython.check\_consistent\_measures module +---------------------------------------------- + +.. automodule:: codonPython.check_consistent_measures + :members: + :undoc-members: + :show-inheritance: + +codonPython.check\_consistent\_submissions module +------------------------------------------------- + +.. automodule:: codonPython.check_consistent_submissions + :members: + :undoc-members: + :show-inheritance: + +codonPython.check\_nat\_val module +---------------------------------- + +.. 
automodule:: codonPython.check_nat_val + :members: + :undoc-members: + :show-inheritance: + +codonPython.check\_null module +------------------------------ + +.. automodule:: codonPython.check_null + :members: + :undoc-members: + :show-inheritance: + +codonPython.dateValidator module +-------------------------------- + +.. automodule:: codonPython.dateValidator + :members: + :undoc-members: + :show-inheritance: + +codonPython.file\_utils module +------------------------------ + +.. automodule:: codonPython.file_utils + :members: + :undoc-members: + :show-inheritance: + +codonPython.mesh module +----------------------- +.. automodule:: codonPython.mesh + :members: + :imported-members: + :no-undoc-members: + :exclude-members: dataclass + :show-inheritance: + :member-order: bysource + +Requirements +++++++++++++ +Using the MESH API requires a valid API certificate issued by DIR (for live environments) or Platforms (for development/test environments). +Guidance on obtaining a certificate can be found `in the MESH guidance hub `_. +The module requires the certificate be in PEM format; the certificate enrolment tool produces a java keystore which the valid certificate can be extracted from if needs be. +Due to limitations inherited from the Requests library, the private key *must* be unencrypted. + +The keystore will also contain root authority certificates; these should also be extracted and combined into a certificate bundle for use in confirming the identity of the endpoint being communicated with. This check *can* be disabled, but this is not recommended. + +Finally, use of the MESH API requires the API secret key, which can be requested from Platforms. + +Example usage ++++++++++++++ +.. include:: example_usage/mesh.rst + +codonPython.nhsd\_colours module +-------------------------------- + +.. automodule:: codonPython.nhsd_colours + :members: + :undoc-members: + :show-inheritance: + +codonPython.nhsNumber module +---------------------------- + +.. 
automodule:: codonPython.nhsNumber + :members: + :undoc-members: + :show-inheritance: + +codonPython.ODS_lookup module +----------------------------- + +.. automodule:: codonPython.ODS_lookup + :members: + :undoc-members: + :show-inheritance: + +codonPython.suppression module +------------------------------ + +.. automodule:: codonPython.suppression + :members: + :undoc-members: + :show-inheritance: + +codonPython.tableFromSql module +------------------------------- + +.. automodule:: codonPython.tableFromSql + :members: + :undoc-members: + :show-inheritance: + +codonPython.tolerance module +---------------------------- + +.. automodule:: codonPython.tolerance + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: codonPython + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..320133c --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,59 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import os +import sys +sys.path.insert(0, os.path.abspath("../../")) +sys.setrecursionlimit(1500) + +# -- Project information ----------------------------------------------------- + +project = 'codon' +copyright = 'under BSD-3 license' +author = 'PH, PE, NC, GR, MM' + +# The full version, including alpha/beta/rc tags +release = '0.0.21' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.githubpages', + 'm2r', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/docs/source/example_usage/mesh.rst b/docs/source/example_usage/mesh.rst new file mode 100644 index 0000000..3dbfe35 --- /dev/null +++ b/docs/source/example_usage/mesh.rst @@ -0,0 +1,62 @@ +Use of the MESH API will generally follow one of a few standard patterns. + +Upload file pattern: + +* Check Authentication with the MESH API endpoint +* Send one or more files (chunked if needed) + +.. 
code-block:: + + from codonPython.mesh import MESHConnection + mesh_client = MESHConnection( + mailbox = 'test' + password = 'test' + api_shared_key = 'test' + cert_loc = './certs/test.cert' + key_loc = './certs/test.key' + base_ca_loc = './certs/test.ca-bundle') + + if mesh_client.check_authentication(): + mesh_client.send_file('test_recipient', './test/test_20200224_1100.txt.dat', 'test_workflow') + + +Download file pattern: + +* Check Authentication with the MESH API endpoint +* Request a list of files to download from the endpoint +* Download and process each in turn (downloading chunked files if needed) +* After each file has been *successfully* processed, send acknowledgement to the MESH API for that file +* Repeat as needed until there are no more files which can be processed. Note that the MESH API will return at most 500 message IDs at a time. + +.. code-block:: + + from codonPython.mesh import MESHConnection + mesh_client = MESHConnection( + mailbox = 'test' + password = 'test' + api_shared_key = 'test' + cert_loc = './certs/test.cert' + key_loc = './certs/test.key' + base_ca_loc = './certs/test.ca-bundle') + + if mesh_client.check_authentication(): + # To save all messages to a folder + mesh_client.check_and_download('./inbox') + # To perform more complex processing on each + for message in mesh_client.check_and_download(): + process(message) + # To perform the flow manually instead of using the check_and_download helper method + errors = [] + inbox_messages = mesh_client.check_inbox() + for message_id in inbox_messages: + try: + message = mesh_client.download_message(message_id, './inbox') + process(message) + mesh_client.ack_download_message(message_id) + except Exception as e: + errors.append(e) + + +NB: +The regular MESH client has strict restrictions on filename and type. The API does not have any such restrictions, however it is likely that the system the files are being sent to expects files to be in this format.
+Unless you know otherwise files should be sent with .dat extension, with filename pattern Organisation_Date_Time.ext.dat where .ext is the original file extension. \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..31dfff6 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,17 @@ +Welcome to the Documentation! +============================= + +.. mdinclude:: ../../README.md + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + modules + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/modules.rst b/docs/source/modules.rst new file mode 100644 index 0000000..93d91fd --- /dev/null +++ b/docs/source/modules.rst @@ -0,0 +1,7 @@ +codonPython +=========== + +.. toctree:: + :maxdepth: 4 + + codonPython diff --git a/docs/source/setup.rst b/docs/source/setup.rst new file mode 100644 index 0000000..552eb49 --- /dev/null +++ b/docs/source/setup.rst @@ -0,0 +1,7 @@ +setup module +============ + +.. automodule:: setup + :members: + :undoc-members: + :show-inheritance: diff --git a/requirements.txt b/requirements.txt index c2ffd37..93f5b3a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,15 @@ numpy>=1.16.0 scipy>=0.19.0 pandas>=0.24.0 -sqlalchemy>=1.3.5 +sqlalchemy>=1.3.12 +pyodbc scikit-learn>=0.21.2 -statsmodels>=0.10.0 \ No newline at end of file +statsmodels>=0.10.0 +seaborn>=0.9.0 +sphinx==2.2.2 +sphinx-rtd-theme>=0.4.3 +flake8>=3.7.9 +m2r>=0.2.1 +requests>=2.22.0 +requests-mock>=1.7.0 +dataclasses>=0.7; python_version == '3.6' diff --git a/setup.py b/setup.py index 4c12ee5..358008e 100644 --- a/setup.py +++ b/setup.py @@ -5,9 +5,9 @@ setup( name='codonPython', - version='0.2.1', + version='0.2.3.1', license='BSD', - packages=['codonPython', ], + packages=find_packages(), install_requires=requirements, author='NHS Digital DIS Team', author_email='paul.ellingham@nhs.net',