Skip to content

AFES

explore_files(path)

Return a dataframe with all the files.

Parameters:

Name Type Description Default
path str | Path

Path to the file or to the directory with files.

required

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame with description of the files.

Source code in src/afes/afe.py
def explore_files(path: str | Path) -> pd.DataFrame:
    """Return a dataframe with all the files.

    The files found under ``path`` are described by ``_get_descriptions``;
    a ``separator`` column is then added and filled in for every file whose
    extension is listed in ``PLAIN_FORMATS`` (left as ``None`` otherwise).

    Args:
        path (str | Path): Path to the file or to the directory with files.

    Returns:
        pd.DataFrame: DataFrame with description of the files.
    """
    path = Path(path)
    all_files = _get_files(path)
    df = _get_descriptions(all_files=all_files)

    # Determine the separator
    df["separator"] = None
    # Read and write strictly by position so the result does not depend on
    # the index labels of the frame returned by `_get_descriptions`.
    separator_col = df.columns.get_loc("separator")
    records = zip(df["name"], df["rows"], df["extension"], df["path"])
    pbar = tqdm(enumerate(records), total=len(df))
    for position, (name, n_rows, extension, file_path) in pbar:
        pbar.set_description(f"{name} ({n_rows:,} records)")
        if extension in PLAIN_FORMATS:
            df.iat[position, separator_col] = get_separator(file_path)

    return df

generate_code(df, python_file='code.txt', verbose=True)

Generate pandas code to load the files.

Parameters:

Name Type Description Default
df DataFrame

DataFrame with the explored files.

required
python_file str

Name of the file to save the code. Defaults to "code.txt".

'code.txt'
verbose bool

Flag to print the code. Defaults to True.

True
Source code in src/afes/afe.py
def generate_code(
    df: pd.DataFrame,
    python_file: str = "code.txt",
    verbose: bool = True,
):
    """Generate pandas code to load the explored files.

    Public entry point that forwards its arguments unchanged to
    ``generate_pandas_code``.

    Args:
        df (pd.DataFrame): DataFrame with the explored files.
        python_file (str, optional): Name of the file to save the code.
            Defaults to "code.txt".
        verbose (bool, optional): Flag to print the code. Defaults to True.
    """
    # Collect the output options once and hand them over by keyword.
    output_options = {"python_file": python_file, "verbose": verbose}
    generate_pandas_code(df, **output_options)

profile_files(df, output_path='.', profile_tool='ydata-profiling')

Profile the structured data.

Parameters:

Name Type Description Default
df DataFrame

DataFrame with the files to be profiled.

required
output_path str | Path

Folder to save the HTML reports. Defaults to ".".

'.'
profile_tool str

Select which profiling tool to use. Defaults to "ydata-profiling".

'ydata-profiling'
Source code in src/afes/afe.py
def profile_files(
    df: pd.DataFrame,
    output_path: str | Path = ".",
    profile_tool: str = "ydata-profiling",
):
    """Profile the structured data.

    Args:
        df (pd.DataFrame): DataFrame with the files to be profiled.
        output_path (str | Path, optional): Folder to save the HTML reports.
            Defaults to ".".
        profile_tool (str, optional): Select which profiling too to use.
            Defaults to "ydata-profiling".
    """
    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)
    df.sort_values(by="size", inplace=True)
    print(
        f"Profiling files with {profile_tool} and generating reports in folder {output_path}"
    )
    pbar = tqdm(df.iterrows(), total=len(df))
    for _, r in pbar:
        pbar.set_description(f"Profiling {r['name']} ({r['rows']:,} records)")
        if r.rows > 0:
            df_to_profile = load_file_with_pandas(
                file_path=r["path"],
                file_name=r["name"],
                extension=r["extension"],
                sep=r["separator"],
            )
            if profile_tool == "ydata-profiling":
                profile_with_ydata_profiling(
                    output_path=output_path,
                    df_to_profile=df_to_profile,
                    file_name=r["name"],
                    file_size=r["size"],
                )
            elif profile_tool == "sweetviz":
                profile_with_sweetviz(
                    df_to_profile=df_to_profile,
                    output_path=output_path,
                    file_name=r["name"],
                )

    print(f'\nCheck out all the reports in "{output_path.resolve()}"\n')
    return