askill
data-ai-skills

data-ai-skillsSafety 90Repository

Master machine learning, data engineering, AI engineering, LLMs, prompt engineering, and MLOps. Build intelligent systems with Python.

0 stars
1.2k downloads
Updated 2/5/2026

Package Files

Loading files...
SKILL.md

Data & AI Engineering Skills

Python Fundamentals

import numpy as np
import pandas as pd
from typing import Optional, List

# NumPy arrays with type hints
def process_array(arr: np.ndarray) -> np.ndarray:
    """Process numpy array with validation."""
    if arr.size == 0:
        raise ValueError("Empty array not allowed")
    return np.clip(arr, 0, 1)

# Pandas DataFrames with error handling
def load_data(path: str) -> pd.DataFrame:
    """Load data with validation."""
    try:
        df = pd.read_csv(path)
        if df.empty:
            raise ValueError(f"Empty dataset: {path}")
        return df
    except FileNotFoundError:
        raise FileNotFoundError(f"Data file not found: {path}")

Machine Learning with Scikit-Learn

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import logging

logger = logging.getLogger(__name__)

def train_model(X, y, n_estimators=100, random_state=42):
    """Train model with logging and validation."""
    logger.info(f"Training with {len(X)} samples")

    # Validate input
    if len(X) != len(y):
        raise ValueError("X and y must have same length")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y
    )

    # Train
    model = RandomForestClassifier(n_estimators=n_estimators)
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)

    logger.info(f"Accuracy: {report['accuracy']:.4f}")
    return model, report

Deep Learning with PyTorch

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

class SimpleNet(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int = 64):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        return torch.sigmoid(self.fc2(x))

def train_epoch(model, dataloader, optimizer, loss_fn, device):
    """Train one epoch with gradient clipping."""
    model.train()
    total_loss = 0

    for batch_x, batch_y in dataloader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        output = model(batch_x)
        loss = loss_fn(output, batch_y)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

LLM Applications (2025 Best Practices)

from anthropic import Anthropic
import backoff

client = Anthropic()

@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def query_llm(prompt: str, max_tokens: int = 1024) -> str:
    """Query LLM with retry logic."""
    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=max_tokens,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return message.content[0].text

# Structured output
def extract_entities(text: str) -> dict:
    """Extract entities with validation."""
    prompt = f"""Extract entities from the text.
    Return JSON with keys: persons, organizations, locations.

    Text: {text}
    """
    response = query_llm(prompt)
    return json.loads(response)

Data Pipeline with Checkpointing

import os
from pathlib import Path

def process_with_checkpoint(
    df: pd.DataFrame,
    checkpoint_dir: str,
    step_name: str
) -> pd.DataFrame:
    """Process with checkpoint recovery."""
    checkpoint_path = Path(checkpoint_dir) / f"{step_name}.parquet"

    if checkpoint_path.exists():
        print(f"Loading checkpoint: {step_name}")
        return pd.read_parquet(checkpoint_path)

    # Process
    result = expensive_transform(df)

    # Save checkpoint
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
    result.to_parquet(checkpoint_path)

    return result

MLOps with MLflow

import mlflow
import mlflow.sklearn

def train_with_tracking(X_train, y_train, X_test, y_test, params):
    """Train with experiment tracking."""
    mlflow.set_experiment("model-training")

    with mlflow.start_run():
        # Log parameters
        mlflow.log_params(params)

        # Train
        model = RandomForestClassifier(**params)
        model.fit(X_train, y_train)

        # Evaluate
        accuracy = model.score(X_test, y_test)
        mlflow.log_metric("accuracy", accuracy)

        # Log model
        mlflow.sklearn.log_model(model, "model")

        return model, accuracy

Model Evaluation Metrics

TaskPrimary MetricSecondary Metrics
ClassificationAccuracy, F1Precision, Recall, AUC
RegressionRMSE, MAER², MAPE
RankingNDCGMRR, MAP
ClusteringSilhouetteDavies-Bouldin

Unit Test Template

import pytest
import numpy as np
from your_module import train_model, process_array

class TestMLFunctions:
    @pytest.fixture
    def sample_data(self):
        X = np.random.randn(100, 10)
        y = np.random.randint(0, 2, 100)
        return X, y

    def test_train_model(self, sample_data):
        X, y = sample_data
        model, report = train_model(X, y)

        assert model is not None
        assert 'accuracy' in report
        assert 0 <= report['accuracy'] <= 1

    def test_process_array_empty(self):
        with pytest.raises(ValueError, match="Empty array"):
            process_array(np.array([]))

    def test_process_array_clips(self):
        arr = np.array([−1, 0.5, 2])
        result = process_array(arr)
        assert result.min() >= 0
        assert result.max() <= 1

Troubleshooting Guide

SymptomCauseSolution
CUDA OOMBatch too largeReduce batch, use gradient accumulation
Loss NaNLearning rate too highReduce LR, add gradient clipping
OverfittingModel too complexAdd regularization, more data
Slow trainingI/O bottleneckUse DataLoader workers

Key Concepts Checklist

  • Python basics and NumPy
  • Pandas data manipulation
  • Data visualization (Matplotlib, Seaborn)
  • Supervised learning (classification, regression)
  • Unsupervised learning (clustering)
  • Feature engineering
  • Model evaluation and metrics
  • Hyperparameter tuning
  • Cross-validation
  • Handling imbalanced data
  • Deep learning basics
  • LLM integration
  • Prompt engineering
  • MLOps pipeline setup

Source: https://roadmap.sh Version: 2.0.0 Last Updated: 2025-01-01

Install

Download ZIP
Requires askill CLI v1.0+

AI Quality Score

95/100Analyzed 2/10/2026

A comprehensive and highly actionable skill covering the full spectrum of Data and AI engineering, from basic Python data processing to LLM integration and MLOps.

90
95
90
95
95

Metadata

Licenseunknown
Version2.0.0
Updated2/5/2026
Publishermajiayu000

Tags

ci-cdllmobservabilitypromptingtesting