This page seeks to address the following question in a straightforward, no-nonsense manner.
My model is far better than I expected... but why? 👀
Occasionally, while working on a problem, we may create and evaluate a model that delivers unexpectedly impressive results. Sometimes, these results are indeed too good to be true. This page serves as a reference to common issues in data processing, modeling, and evaluation protocols that can lead to inflated performance estimates.
This page is intended for practitioners with a working knowledge of machine learning. If you notice any areas in need of improvement or correction, please open a pull request on our GitHub repository.
Data is typically split into two or more subsets before being modeled, commonly known as the train and test sets. The training split is used to fit the learned function, while the test set is used to confirm that the model has learned some generalisable function rather than simply memorised the inputs, which is known as overfitting.
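As a minimal sketch of this protocol, assuming scikit-learn is available and using the IRIS dataset referenced in the diagrams further down:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# load a small example dataset
X, y = load_iris(return_X_y=True)

# hold out 20% of the samples as a test set the model never sees during training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)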
Information leakage can occur at any stage of the modeling process, from data collection and preparation through to training and evaluation. It occurs when the model has access to more information during development and evaluation than it will have when deployed on out-of-sample data in production. This additional information gives the model knowledge it would not otherwise have at inference time, and in turn invalidates the estimated performance of the model being constructed. In abstract terms, this can be viewed as an open-book exam. Information leakage typically results in overly optimistic expectations of model performance; it is therefore characterised by poor performance on out-of-sample datasets, i.e. when the model is put to use in production environments.
The most fundamental example of data leakage is when the target values of a dataset are included as an input to the model, for example, when the label or regression target is included in the set of input features used to predict itself. This is an unambiguous example of information leakage between the input and output values of a data sample.
Often, information leakage can be much more subtle than directly including the target in the model input. Information about the target can be leaked from the input sample through errant data collection and curation practices. For example, during the ICML 2013 Whale Challenge, a competitor discovered that the dataset provided for the competition was leaking information in the training set. Christopher Hefele found that
This is an example of information leakage because, when the model is deployed to production and used with data in the wild, the clips will likely not be so carefully curated.
Real-world datasets are often imperfect and can contain duplicate data. If left in, these duplicates can end up spread across the training, test, and validation sets, which results in the model training and testing on the same data. Sometimes, due to noise in the data, there may also be near-identical samples, which can cause the same problem if they represent a large portion of the dataset.
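One simple mitigation, sketched here under the assumption of a hypothetical pandas DataFrame df, is to remove exact duplicates before splitting. Near-identical samples require fuzzier deduplication, which is beyond this sketch.

import pandas as pd

# hypothetical DataFrame of samples; replace with your own data
df = pd.DataFrame({
    "feature_a": [1, 1, 2, 3],
    "feature_b": [4, 4, 5, 6],
    "target": [0, 0, 1, 1],
})

# drop exact duplicate rows before any train/test split so the same
# sample cannot appear on both sides of the split
df = df.drop_duplicates().reset_index(drop=True)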
Data can evolve over time. If we use a snapshot of the data taken at a single point in time, the model will learn to make predictions for data at that stage of its lifecycle and will perform worse on earlier or later stages. What the snapshot does not show is the succession of mutations applied to the data before it reached the state at which it was exported and used to create a predictive model.
This can be demonstrated when collecting profiles of users. For example, when creating a model to predict the default rate of loan applications, a financial institution may use a snapshot export of a database of real customers. Within the dataset there is a feature describing the number of late payments for a given customer, a number that changes over time. If the model relies on that feature, it may perform poorly on loans early in their life, before much payment history has accumulated. Therefore, if we train and evaluate our model on a dataset that contains more accumulated observations per sample than it will see at inference time in production, we will overestimate the model's performance unless it is only ever deployed on data collected at the same stage in the loan's life.
The simplest form of information leakage during dataset preparation is directly leaking the target variable into the feature set, for example, by including the label within the set of features the model learns from. This can happen simply by forgetting to drop the target columns during dataset preparation, as sketched below.
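A minimal, hypothetical example with a pandas DataFrame whose label lives in a column named "target":

import pandas as pd

# hypothetical dataset with the label stored in a column named "target"
df = pd.DataFrame({
    "feature_a": [1, 2, 3],
    "feature_b": [4, 5, 6],
    "target": [0, 1, 0],
})

# separate inputs and label explicitly so the target cannot
# accidentally remain in the feature matrix
X = df.drop(columns=["target"])
y = df["target"]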
When working with datasets of real-world origin, practitioners often encounter corrupted or missing data. This usually occurs with tabular datasets that have missing values and require imputation. In such instances, it is common to attempt to rescue the corrupted samples by imputing missing values with descriptive statistics computed over the remaining non-missing values, such as the mean, median, or mode of that feature. These data cleaning steps are typically unproblematic when applied with care. The issues arise when imputation is performed over the entire dataset, prior to splitting into training, test, and validation subsets, because information about the test data will then "leak" into the training data and vice versa. The test data is supposed to be out of sample, so this is a problem. The real-world performance difference here is typically negligible if the data has been collected well, because the train and test data should not come from materially different distributions. However, it represents a failure in protocol, which is never a good thing and can lead to larger errors overall. Not to mention that a negligible performance boost could be the difference between state-of-the-art performance and regular performance.
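A sketch of the leak-free ordering, assuming scikit-learn's SimpleImputer and a toy feature matrix with missing values: split first, fit the imputer on the training portion only, then reuse its statistics on the held-out data.

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# toy feature matrix with missing values (np.nan)
X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan],
              [4.0, 5.0], [6.0, 1.0], [np.nan, 8.0]])
y = np.array([0, 1, 0, 1, 0, 1])

# split BEFORE imputing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# fit the imputer on the training data only...
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)

# ...and reuse the training-set statistics on the test data
X_test = imputer.transform(X_test)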
The same can be true when transforming raw source data into features for your model, whether by cleaning or by extraction, such as taking the mean of a window of points in a time series model. Say your data is in tabular format and you add a new feature column computed with a window of size 4 and a stride of 1. This feature now incorporates data from surrounding rows. If your data is in temporal order and you are predicting t+n, where n falls within the window, you are leaking future information into your features.
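A hypothetical illustration with pandas: a centred rolling mean mixes future rows into the feature at time t, whereas a trailing window over past values only does not.

import pandas as pd

# hypothetical univariate series in temporal order
df = pd.DataFrame({"price": [10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0]})

# LEAKY: a centred window of size 4 mixes rows from the future into the
# feature at time t, so a model predicting t+1 sees future information
df["mavg_leaky"] = df["price"].rolling(window=4, center=True).mean()

# SAFER: a trailing window over past values only; shift(1) ensures the
# feature at time t uses observations strictly before t
df["mavg_safe"] = df["price"].shift(1).rolling(window=4).mean()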
A clear sign of leakage during feature engineering is a feature that is derived as a function of the target variable(s), for example, creating a model to predict the annual salary of an employee while including a feature representing the monthly salary of that same employee.
More commonly, leaks are far more nuanced, for example, training a model to predict the price of some stock at time t+1 while also supplying the traded volume at t+1 as part of the input data. That volume will only be known at t+1 and therefore cannot be used when predicting into the future from time t.
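A minimal sketch of the fix, assuming a hypothetical DataFrame with price and volume columns in temporal order: construct the target by shifting the series forward, and use only values observed at time t or earlier as inputs.

import pandas as pd

# hypothetical market data in temporal order
df = pd.DataFrame({
    "price": [100.0, 101.5, 99.8, 102.3, 103.1],
    "volume": [2000, 2400, 1800, 2600, 2200],
})

# the prediction target: the price one step ahead
df["price_t_plus_1"] = df["price"].shift(-1)

# only price and volume observed at time t are used as inputs;
# supplying volume at t+1 would leak future information
features = df[["price", "volume"]]
target = df["price_t_plus_1"]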
In computer vision tasks, it is commonplace to augment the training set with transformations such as black-and-white conversion, mirroring, cropping, rotation, and so on, depending on the task at hand. If the practitioner augments the entire dataset before splitting it, sample (A) in the dataset may be expanded into n derived examples with various augmentations applied. The dataset is then split randomly, and the derived copies of sample (A) are spread across the train/test/validation sets. This means the model has the opportunity to learn from sample (A) in the training set and then be evaluated on a copy of the same sample, which naturally leads to inflated performance during evaluation.
The same applies to any source of duplicate rows across the train/validation/test sets: oversampling a dataset to pad its size before splitting, different rotations/augmentations of a single image, bootstrap sampling before splitting, or duplicating rows to up-sample the minority class. One safeguard, sketched below, is to split by the original sample rather than by row.
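A sketch using scikit-learn's GroupShuffleSplit and a hypothetical source_id array recording which original sample each (possibly augmented or duplicated) row came from, so that every derived copy lands on the same side of the split. Better still, apply augmentation only after the split, and only to the training set.

import numpy as np
from sklearn.model_selection import GroupShuffleSplit

# hypothetical rows: augmented copies derived from three original samples
X = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 1, 1, 0, 1])
source_id = np.array([0, 0, 1, 1, 2, 2])  # which original sample each row came from

# split by source sample, not by row, so all copies of a sample stay together
splitter = GroupShuffleSplit(n_splits=1, test_size=0.33, random_state=0)
train_idx, test_idx = next(splitter.split(X, y, groups=source_id))
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]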
Commonly, the data input to a machine learning model must be scaled. If this is performed on the full dataset prior to splitting, we leak information about the test and validation sets into the training set. The test and validation sets should be analogous to data collected from the wild.
Most importantly, we are using information from data that appears in both the test and training sets, because the mean and standard deviation are computed over the full dataset, not just the training data. Conversely, normalising the train and test sets separately is also incorrect, though not because of leakage: the two operations will use different means and standard deviations, which breaks the assumption that the data is drawn from the same distribution. It is also operationally awkward. What mean and standard deviation will you use in production for inference on hitherto unseen examples? Those of the train set, the test set, or of the batch of data you are inferencing on? This can lead to worsened real-world performance.
The correct approach is to first split the dataset, then fit the normalisation on the training set, and finally apply the training set's mean and standard deviation when normalising the test and validation sets.
%%HTML
<pre class="mermaid">
flowchart LR
subgraph The right way
direction LR %%
A[Source Dataset \n e.g. IRIS ]
A --> B{fa:fa-spinner Random Split}
B --> C(StandardScaler \nFit) --> |Mean and Std| X(StandardScaler \nTransform) --> Y[Train]
C--> |Mean and Std| E
B --> E(StandardScaler \nTransform) --> F[Test]
C--> |Mean and Std| G
B --> G(StandardScaler \nTransform) --> H[Validation]
end
</pre>
<pre class="mermaid">
flowchart LR
subgraph The wrong way
direction LR %%
A[Source Dataset \n e.g. IRIS] --> B(StandardScaler \nFit & Transform)
B --> C{fa:fa-spinner Random Split}
C --> D[Train]
C --> E[Test]
C --> F[Validation]
end
</pre>
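The same ordering in code, as a minimal sketch with scikit-learn's StandardScaler: fit on the training split only, then reuse its statistics for the test and validation splits. Wrapping the scaler and model in a scikit-learn Pipeline achieves the same effect and avoids fitting preprocessing on held-out data by construction.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# load the IRIS dataset used as the example source above
X, y = load_iris(return_X_y=True)

# split first: train / test / validation
X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.4, random_state=0)
X_test, X_val, y_test, y_val = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=0)

# fit the scaler on the training data only
scaler = StandardScaler().fit(X_train)

# apply the training-set mean and standard deviation everywhere
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)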
When building predictive models, we are interested in the performance of the model on data that was not used to train it, as this gives us an idea of real-world performance. Typically, we gauge this performance on out-of-sample data points by using some variation of a train-test split protocol. The test set is used to gauge the performance of the model on unseen data, like the data it would encounter in the final deployment environment. When constructing these subsets, it is good practice to randomly shuffle the full dataset before splitting into the train and test sets. However, shuffling the full dataset is only good practice when the data is independent and identically distributed (i.i.d.).
Time series data is not i.i.d. When we shuffle non-i.i.d. data we can cause information leakage, because shuffling samples is only valid under the assumption that there is no relationship between the observations. This is not true of time series data, which is a sequence of observations where the ordering matters. The effect of a random split is illustrated below.
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
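# load the Apple stock sample dataset from the Plotly datasets repository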
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/finance-charts-apple.csv')
train_colour = "rgba(52, 152, 219, 1)"
test_colour = "rgba(211, 84, 0, 1)"
fig = make_subplots(rows=1, cols=2, subplot_titles=("Random Split", "Temporal Split"))
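# plot the same moving-average series on both subplots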
fig.add_trace(
go.Scatter(
x=df["Date"],
y=df["mavg"],
),
row=1,
col=1
)
fig.add_trace(
go.Scatter(
x=df["Date"],
y=df["mavg"],
),
row=1,
col=2
)
# add vrects to left plot
fig.add_vrect(
x0="2015-02-17",
x1="2015-04-17",
row=1,
col=1,
annotation_text="test",
annotation_position="top right",
fillcolor=test_colour,
opacity=0.33,
line_width=1
)
fig.add_vrect(
x0="2015-04-17",
x1="2015-08-17",
row=1,
col=1,
annotation_text="train",
annotation_position="top right",
fillcolor=train_colour,
opacity=0.33,
line_width=1
)
fig.add_vrect(
x0="2015-08-17",
x1="2015-11-17",
row=1,
col=1,
annotation_text="test",
annotation_position="top right",
fillcolor=test_colour,
opacity=0.33,
line_width=1
)
fig.add_vrect(
x0="2015-11-17",
x1="2016-06-17",
row=1,
col=1,
annotation_text="train",
annotation_position="top right",
fillcolor=train_colour,
opacity=0.33,
line_width=1
)
fig.add_vrect(
x0="2016-06-17",
x1="2016-08-16",
row=1,
col=1,
annotation_text="test",
annotation_position="top right",
fillcolor=test_colour,
opacity=0.33,
line_width=1
)
fig.add_vrect(
x0="2016-08-16",
x1="2017-02-16",
row=1,
col=1,
annotation_text="train",
annotation_position="top right",
fillcolor=train_colour,
opacity=0.33,
line_width=1
)
# add vrects to right plot
fig.add_vrect(
x0="2015-02-17",
x1="2016-08-16",
row=1,
col=2,
annotation_text="train",
annotation_position="top right",
fillcolor=train_colour,
opacity=0.33,
line_width=1
)
fig.add_vrect(
x0="2016-08-16",
x1="2017-02-16",
row=1,
col=2,
annotation_text="test",
annotation_position="top right",
fillcolor=test_colour,
opacity=0.33,
line_width=1
)
fig.show()
In the random split, we can see that the training set contains future information for the model to learn from, which, strictly speaking, it would not have access to in a real-world scenario. For this reason, we must split the dataset into subsets while maintaining the temporal order in which the values were observed. With simple data preparation protocols this is easy: order the data along the time axis, then split. When we wish to introduce more complex training and validation protocols such as k-fold cross-validation, we can instead use the walk-forward validation method. The walk-forward protocol again respects the temporal order of the data but uses a subset of the data at a time. This is sometimes referred to as out-of-time validation.
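A minimal sketch of walk-forward splits using scikit-learn's TimeSeriesSplit, assuming the observations are already ordered in time: each fold trains on an expanding window of the past and tests on the block that immediately follows it.

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# hypothetical, time-ordered observations
X = np.arange(20).reshape(10, 2)
y = np.arange(10)

# each fold trains on past observations only and tests on the next block
tscv = TimeSeriesSplit(n_splits=4)
for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    print(f"fold {fold}: train={train_idx}, test={test_idx}")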
If something seems too good to be true, it often is. But do not be discouraged! Every practitioner has made at least one of these errors at some point. If you see one of these errors being made in the wild within your team, feel free to share a permalink to one of the OPAI headers to spark a discussion about your specific situation.
Errors are unavoidable; to catch them is desirable.
This is an open-source living document. The source is hosted on this public GitHub repository. If you have additions, corrections, questions, or anything in between, please open an Issue or a Pull Request. All contributions are greatly appreciated!
Time series: If a random variable $X$ is indexed to time, usually denoted by $t$, the set of observations $\{X_{t}, t \in T\}$ is called a time series, where $T$ is the time index set. A time series is thus a set of observations $x_{t}$, each one recorded at a specific time $t$.
%%HTML
<script type="module">
import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@10.1/dist/mermaid.esm.min.mjs';
let config = { startOnLoad: true, securityLevel: 'loose', htmlLabels:true, flowchart: { useMaxWidth: false, htmlLabels: true } };
mermaid.initialize(config);
mermaid.run({ htmlLabels:true,
querySelector: '.mermaid',
});
</script>
<style>
.mermaid *{font-family: sans-serif; }
.mermaid .node, .mermaid .cluster{
fill: white !important;
stroke: black !important;
stroke-width: 1px !important;
}
.mermaid div{
text-align: center;
}
.mermaid .label{
color: black;
}
.jp-InputArea-editor {
display: block;
}
</style>